knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(fig.width=6, fig.height=4) 

options(scipen = 999)  ## disable scientific notation 4.835312e-04 --> 0.0004835312

# install.packages("alr3")  ## note - you only have to do this once
library(alr3)
library(ggplot2)
library(dplyr)
library(tidyr)
library(broom)
library(splines)
library(gridExtra)
library(reshape)
library(glmnet)
library(MASS)
library(mgcv)
library(lme4)
library(gee)
library(readr)
library(boot)
library(SemiPar) # get data(libar)

## 0.0.1 Intro to regression

####################################################################

## NOTE: the original ran rm(list = ls()) here to "clear the work space".
## That silently deletes every object in the user's global environment
## (and still does not reset options() or detach packages), so it has
## been removed; everything below defines what it needs before use.

## load the mothers'/daughters' heights data shipped with the alr3 package

data(heights)

####################################################################
## initial data looking
####################################################################

## some summary looks at the height data set
## (Mheight = mother's height, Dheight = daughter's height, in inches)
head(heights)
##   Mheight Dheight
## 1    59.7    55.1
## 2    58.2    56.5
## 3    60.6    56.0
## 4    60.7    56.8
## 5    61.8    56.0
## 6    55.5    57.9
## convert to a tibble for nicer printing; tbl_df() is deprecated in
## dplyr, as_tibble() is its drop-in replacement
heights = as_tibble(heights)
heights
## # A tibble: 1,375 x 2
##    Mheight Dheight
##      <dbl>   <dbl>
##  1    59.7    55.1
##  2    58.2    56.5
##  3    60.6    56  
##  4    60.7    56.8
##  5    61.8    56  
##  6    55.5    57.9
##  7    55.4    57.1
##  8    56.8    57.6
##  9    57.5    57.2
## 10    57.3    57.1
## # ... with 1,365 more rows
dplyr::glimpse(heights)
## Observations: 1,375
## Variables: 2
## $ Mheight <dbl> 59.7, 58.2, 60.6, 60.7, 61.8, 55.5, 55.4, 56.8, 57.5, ...
## $ Dheight <dbl> 55.1, 56.5, 56.0, 56.8, 56.0, 57.9, 57.1, 57.6, 57.2, ...
str(heights)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1375 obs. of  2 variables:
##  $ Mheight: num  59.7 58.2 60.6 60.7 61.8 55.5 55.4 56.8 57.5 57.3 ...
##  $ Dheight: num  55.1 56.5 56 56.8 56 57.9 57.1 57.6 57.2 57.1 ...
dim(heights)
## [1] 1375    2
## look at daughters' heights only
summary(heights[,2])
##     Dheight     
##  Min.   :55.10  
##  1st Qu.:62.00  
##  Median :63.60  
##  Mean   :63.75  
##  3rd Qu.:65.60  
##  Max.   :73.10
## histogram of the outcome (daughter height); geom_histogram defaults to
## 30 bins, which triggers the stat_bin() message reproduced below
ggplot(heights, aes(x = Dheight)) + 
  geom_histogram(fill = "#0000FF", alpha = .5, color = "black") +
  labs(x = "Daughter Height", y = "Count", title = "Histogram of Daughter Heights")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## plot daughter height against mother height
ggplot(heights, aes(x = Mheight, y = Dheight)) + 
  geom_point(color = "red") +
  labs(x = "Mother Height", y = "Daughter Height")

####################################################################
## some "regression" analyses
####################################################################

## base scatterplot that every candidate f(x) below is layered onto
plot0 = ggplot(heights, aes(x = Mheight, y = Dheight)) + 
  geom_point(color = "red") +
  labs(x = "Mother Height", y = "Daughter Height")

## order predictors and outcome
## (sorting by Mheight makes geom_path draw curves left-to-right)
heights = arrange(heights, Mheight)
x = heights$Mheight
y = heights$Dheight

## f(x) = E(y)
## simplest "fit": a flat line at the overall mean of y
fx = mean(y)

plot1 = plot0 + geom_hline(aes(yintercept = fx), color = "blue", size =1.2)
plot1

## interpolation
## opposite extreme: f passes through every observed point exactly
fx = y

plot2 = plot0 + geom_path(aes(x = x, y = fx), color = "blue")
plot2

## bin means: step-function fit using the within-bin average of y.
## cut() splits the x range into 5 equal-width bins (bin index per point).
x.bin = as.numeric(cut(x, breaks = 5))
## ave() returns the group mean of y for each element directly; this
## replaces the original per-element sapply/which scan, which recomputed
## the same bin mean once per observation (accidental O(n * k) work).
fx = ave(y, x.bin)

plot3 = plot0 + geom_path(aes(x = x, y = fx), color = "blue", size = 1.2)
plot3

## smooth curve 1
## small span (.15) -> loess uses few neighbors -> wiggly curve
fit.loess = loess(y ~ x, span = .15)
fx = predict(fit.loess, data.frame(x = x))

plot4 = plot0 + geom_path(aes(x = x, y = fx), color = "blue", size = 1.2)
plot4

## smooth curve 2
## larger span (.75, the loess default) -> smoother curve
fit.loess = loess(y ~ x, span = .75)
fx = predict(fit.loess, data.frame(x = x))

plot5 = plot0 + geom_path(aes(x = x, y = fx), color = "blue", size = 1.2)
plot5

## line 1
## a hand-picked (not estimated) line for comparison
fx = 40 + .4 * x

plot6 = plot0 + geom_path(aes(x = x, y = fx), color = "blue", size = 1.2)
plot6

## line 2
## the least-squares line
fit.lm = lm(y~x)
fx = predict(fit.lm, data.frame(x = x))

plot7 = plot0 + geom_path(aes(x = x, y = fx), color = "blue", size = 1.2)
plot7

## plot0 + stat_smooth(method = "lm")

####################################################################
## simple linear regression
####################################################################

## regress daughter height on mother height; summary() gives the classic
## printout, broom::glance()/tidy() give one-row / per-term tibbles
linmod = lm(Dheight ~ Mheight, data = heights)
summary(linmod)
## 
## Call:
## lm(formula = Dheight ~ Mheight, data = heights)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.397 -1.529  0.036  1.492  9.053 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 29.91744    1.62247   18.44 <0.0000000000000002 ***
## Mheight      0.54175    0.02596   20.87 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.266 on 1373 degrees of freedom
## Multiple R-squared:  0.2408, Adjusted R-squared:  0.2402 
## F-statistic: 435.5 on 1 and 1373 DF,  p-value: < 0.00000000000000022
broom::glance(linmod)
## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
## *     <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.241         0.240  2.27      435. 3.22e-84     2 -3075. 6156. 6172.
## # ... with 2 more variables: deviance <dbl>, df.residual <int>
tidy(linmod)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   29.9      1.62        18.4 5.21e-68
## 2 Mheight        0.542    0.0260      20.9 3.22e-84
####################################################################
####################################################################

## 0.0.2 Simple linear regression and least squares properties

####################################################################
## plot for interpretation
####################################################################

set.seed(1)

## simulate 30 points from the true line Y = 2 + 0.6 X + N(0, 1) noise;
## the two rnorm() calls happen in the same order as before, so the
## draws are identical for this seed
data <- within(data.frame(X = rnorm(30, 3, 3)), Y <- 2 + .6 * X + rnorm(30, 0, 1))

## least-squares line with faint data points; hline/vline mark the axes
ggplot(data, aes(x = X, y = Y)) + 
  geom_smooth(method = 'lm', se = FALSE, size = 2) + 
  geom_point(alpha = .1) + 
  theme_bw() + 
  geom_hline(yintercept = 0) + 
  geom_vline(xintercept = 0)

####################################################################
## a few possible scatterplots
####################################################################

## plot 1: no relationship (x and y independent)
plot.data = data.frame(x = rnorm(100),
                       y = rnorm(100))

ggplot(plot.data, aes(x = x, y = y)) + geom_point()

## plot 2: strong negative linear relationship

plot.data = data.frame(x = rnorm(100)) %>% mutate(y = -2-2*x + rnorm(100, 0, .5))
ggplot(plot.data, aes(x = x, y = y)) + geom_point()

## plot 3: quadratic (nonlinear) relationship
plot.data = data.frame(x = rnorm(100)) %>% mutate(y = -2+2*(x+1)^2 + rnorm(100, 0, 2))
ggplot(plot.data, aes(x = x, y = y)) + geom_point()

## plot 4: plot 3's data with two points overwritten to act as outliers
## (one high-leverage point at x = 4, one interior outlier at y = 15)
plot.data = mutate(plot.data, x = replace(x, 1, 4), y = replace(y, 1, 22), y = replace(y, order(x)[50], 15))
ggplot(plot.data, aes(x = x, y = y)) + geom_point()

####################################################################
## plot for interpretation -- this code uses "old fashioned" plotting
####################################################################

set.seed(1)

## same simulated data as above, plus the fitted values from lm(Y ~ X)
data = data.frame(X = rnorm(30, 3, 3)) %>% mutate(Y = 2+.6*X +rnorm(30, 0, 1))
data$fitted = lm(Y~X, data = data) %>% fitted

## red points = observed Y, black points = fitted values on the line;
## vertical gaps between the two are the residuals
ggplot(data, aes(x = X, y = Y)) + 
  geom_smooth(method = 'lm', se = FALSE, size = 1) + 
  geom_point(color = "red", size = 2) + 
  geom_point(aes(y = fitted), size = 2) + 
  theme_bw() + 
  geom_hline(yintercept = 0) + 
  geom_vline(xintercept = 0)

####################################################################
## recreate linear model output using formulas
####################################################################

set.seed(1)

## simulate data with true intercept 2 and slope 0.6
data = data.frame(x = rnorm(30, 3, 3)) %>% mutate(y = 2+.6*x +rnorm(30, 0, 1))

## make a plot
## dashed line = mean-only model, solid line = least-squares fit
ggplot(data, aes(x = x, y = y)) + 
  geom_point(color = "red", size = 3) +
  geom_hline(aes(yintercept = mean(y)), 
             color = "blue", linetype = 2, size = 1) +
  geom_smooth(method = "lm", se = FALSE, 
              color = "blue", size = 1)

linmod = lm(y~x, data = data)
summary(linmod)
## 
## Call:
## lm(formula = y ~ x, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5202 -0.5050 -0.2297  0.5753  1.8534 
## 
## Coefficients:
##             Estimate Std. Error t value         Pr(>|t|)    
## (Intercept)  2.08743    0.22958   9.092 0.00000000075297 ***
## x            0.61396    0.05415  11.338 0.00000000000561 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8084 on 28 degrees of freedom
## Multiple R-squared:  0.8211, Adjusted R-squared:  0.8148 
## F-statistic: 128.6 on 1 and 28 DF,  p-value: 0.000000000005612
tidy(linmod)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    2.09     0.230       9.09 7.53e-10
## 2 x              0.614    0.0542     11.3  5.61e-12
glance(linmod)
## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
## *     <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.821         0.815 0.808      129. 5.61e-12     2  -35.2  76.3  80.5
## # ... with 2 more variables: deviance <dbl>, df.residual <int>
## BUG FIX: the original computed these from the bare `x`/`y` vectors left
## over from the heights example (L104-105), so it printed the *heights*
## coefficients (0.541747 / 29.91744) instead of recreating the linmod fit
## above. Use the columns of the current simulated data instead.
(beta1 = with(data, sum((x - mean(x))*(y - mean(y))) / sum((x - mean(x))^2)))
## [1] 0.6139621   (matches coef(linmod)["x"])
(beta0 = with(data, mean(y) - beta1*mean(x)))
## [1] 2.087434    (matches coef(linmod)["(Intercept)"])
## names() tells you what's in an object
names(linmod)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
linmod$residuals
##          1          2          3          4          5          6 
##  1.2555987 -0.2398006  0.2933523 -0.2499462 -1.5201821 -0.5099489 
##          7          8          9         10         11         12 
## -0.5440273 -0.2195598  0.9465873  0.6466466 -0.3571673 -0.3990115 
##         13         14         15         16         17         18 
##  0.5936641  0.5201081 -0.8651956 -0.8349338  0.2359394  0.5996784 
##         19         20         21         22         23         24 
## -0.2760649  0.7269107  0.2302926 -0.7741079  0.2086757 -1.1753572 
##         25         26         27         28         29         30 
##  1.2777408  1.8534302 -0.4900165 -1.1118509  0.4604269 -0.2818813
linmod$fitted.values
##          1          2          3          4          5          6 
##  2.7754640  4.2675708  2.3901878  6.8676466  4.5362366  2.4181112 
##          7          8          9         10         11         12 
##  4.8271096  5.2892309  4.9898445  3.3668300  6.7138498  4.6473676 
##         13         14         15         16         17         18 
##  2.7850662 -0.1499047  6.0013156  3.8465581  3.8995001  5.6677597 
##         19         20         21         22         23         24 
##  5.4419168  5.0232194  5.6219726  5.3699269  4.0666609  0.2651611 
##         25         26         27         28         29         30 
##  5.0709693  3.8259380  3.6423631  1.2203620  3.0486227  4.6991216
## names() on the summary of linmod
## (the summary object carries different pieces than the fit itself)
names(summary(linmod))
##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"
summary(linmod)$coef
##              Estimate Std. Error   t value             Pr(>|t|)
## (Intercept) 2.0874344 0.22958105  9.092364 0.000000000752971072
## x           0.6139621 0.05415004 11.338166 0.000000000005611585
summary(linmod)$r.squared
## [1] 0.821148
## ANOVA table for linmod
anova(linmod)
## Analysis of Variance Table
## 
## Response: y
##           Df Sum Sq Mean Sq F value            Pr(>F)    
## x          1  84.02  84.020  128.55 0.000000000005612 ***
## Residuals 28  18.30   0.654                              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## R^2 recovered from the ANOVA table: 1 - SSE / (SSreg + SSE)
1 - 18.30 / (84.02 + 18.30)
## [1] 0.8211493
####################################################################
## influence of outliers on slope
####################################################################

## setting seed ensures reproducibility of random data
set.seed(1)

## get data, estimate linear model
## overwrite observation 4 (a high-leverage x) with y = -2
data = data.frame(x = rnorm(30, 3, 3)) %>% 
  mutate(y = 2+.6*x + rnorm(30, 0, 1),
         y = replace(y, 4, -2))

## make a plot
## dashed line = fit to the contaminated data; solid line uses
## coef(linmod), the outlier-free fit from the previous section,
## so the two fits can be compared directly
ggplot(data, aes(x = x, y = y)) + 
  geom_point(color = "red", size = 3) +
  geom_smooth(method = "lm", se = FALSE, 
              color = "blue", size = 1, linetype = 2) +
  geom_abline(intercept = coef(linmod)[1], 
              slope = coef(linmod)[2], color = "blue", size = 1)

## setting seed ensures reproducibility of random data
set.seed(1)

## get data, estimate linear model
## same contamination but at observation 23 (a low-leverage x)
data = data.frame(x = rnorm(30, 3, 3)) %>% 
  mutate(y = 2+.6*x + rnorm(30, 0, 1),
         y = replace(y, 23, -2))

## make a plot
ggplot(data, aes(x = x, y = y)) + 
  geom_point(color = "red", size = 3) +
  geom_smooth(method = "lm", se = FALSE, 
              color = "blue", size = 1, linetype = 2) +
  geom_abline(intercept = coef(linmod)[1], 
              slope = coef(linmod)[2], color = "blue", size = 1)

####################################################################

## 0.0.3 multiple linear regression

####################################################################
## multiple linear regression
####################################################################

set.seed(1)

## simulate weight from age and sex with parallel true lines
## (same age slope 2.5, males shifted up by 20)
data.mlr = data.frame(age = rnorm(40, 66, 6),
                      sex = rep(c("male", "female"), each = 20)) %>%
  mutate(weight = 6 + 2.5 * age + 20 * (sex == "male") + rnorm(40, 0, 6)) %>%
  tbl_df

ggplot(data = data.mlr, aes(y=weight, x = age, color = sex)) + 
  geom_point(size = 3) + 
  geom_smooth(method = "lm", se = FALSE) + 
  theme_bw()

## second plot; lines only
ggplot(data = data.mlr, aes(y=weight, x = age, color = sex)) + 
  geom_smooth(method = "lm", se = FALSE) + 
  theme_bw()

## description, modeling
summary(data.mlr)
##       age            sex         weight     
##  Min.   :52.71   female:20   Min.   :141.3  
##  1st Qu.:63.60   male  :20   1st Qu.:169.2  
##  Median :66.77               Median :183.2  
##  Mean   :66.55               Mean   :183.1  
##  3rd Qu.:70.47               3rd Qu.:198.9  
##  Max.   :75.57               Max.   :218.3
## additive model: common age slope, sex enters as an intercept shift
linmod = lm(weight ~ age + sex, data = data.mlr)
tidy(linmod)
## # A tibble: 3 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    -9.24    11.2      -0.828 4.13e- 1
## 2 age             2.74     0.168    16.3   1.81e-18
## 3 sexmale        19.9      1.77     11.3   1.53e-13
summary(linmod)
## 
## Call:
## lm(formula = weight ~ age + sex, data = data.mlr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.2249  -3.7504  -0.4383   2.4337  12.4650 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  -9.2395    11.1602  -0.828                0.413    
## age           2.7403     0.1681  16.297 < 0.0000000000000002 ***
## sexmale      19.9383     1.7665  11.287    0.000000000000153 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.551 on 37 degrees of freedom
## Multiple R-squared:  0.9224, Adjusted R-squared:  0.9182 
## F-statistic:   220 on 2 and 37 DF,  p-value: < 0.00000000000000022
head(data.mlr)
## # A tibble: 6 x 3
##     age sex   weight
##   <dbl> <fct>  <dbl>
## 1  62.2 male    181.
## 2  67.1 male    192.
## 3  61.0 male    183.
## 4  75.6 male    218.
## 5  68.0 male    192.
## 6  61.1 male    174.
## design matrix: note sex is encoded as the 0/1 dummy column sexmale
model.matrix(linmod) %>% head
##   (Intercept)      age sexmale
## 1           1 62.24128       1
## 2           1 67.10186       1
## 3           1 60.98623       1
## 4           1 75.57168       1
## 5           1 67.97705       1
## 6           1 61.07719       1
tail(data.mlr)
## # A tibble: 6 x 3
##     age sex    weight
##   <dbl> <fct>   <dbl>
## 1  57.7 female   143.
## 2  63.5 female   167.
## 3  63.6 female   162.
## 4  65.6 female   170.
## 5  72.6 female   188.
## 6  70.6 female   179.
model.matrix(linmod) %>% tail
##    (Intercept)      age sexmale
## 35           1 57.73764       0
## 36           1 63.51003       0
## 37           1 63.63426       0
## 38           1 65.64412       0
## 39           1 72.60015       0
## 40           1 70.57905       0
####################################################################
## interactions
####################################################################

set.seed(1)

## simulate weight where the age slope differs by sex
## (females: slope 2.5; males: slope 2.5 + 2, intercept shifted by -60)
data.mlr = data.frame(age = rnorm(40, 66, 6),
                      sex = rep(c("male", "female"), each = 20)) %>%
  mutate(weight = -20 + 
           2.5 * age - 60 * (sex == "male") + 
           2 * (sex == "male") * age + 
           rnorm(40, 0, 6)) %>%
  tbl_df

ggplot(data = data.mlr, aes(y=weight, x = age, color = sex)) + 
  geom_point(size = 3) + 
  geom_smooth(method = "lm", se = FALSE) + 
  theme_bw()

## second plot; lines only
ggplot(data = data.mlr, aes(y=weight, x = age, color = sex)) + 
  geom_smooth(method = "lm", se = FALSE) + 
  theme_bw()

## description, modeling
summary(data.mlr)
##       age            sex         weight     
##  Min.   :52.71   female:20   Min.   :115.3  
##  1st Qu.:63.60   male  :20   1st Qu.:145.0  
##  Median :66.77               Median :170.7  
##  Mean   :66.55               Mean   :184.2  
##  3rd Qu.:70.47               3rd Qu.:224.8  
##  Max.   :75.57               Max.   :263.4
## age * sex expands to age + sex + age:sex (main effects + interaction)
linmod = lm(weight ~ age * sex, data = data.mlr)
tidy(linmod)
## # A tibble: 4 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -33.5     16.3       -2.05 4.74e- 2
## 2 age             2.71     0.247     11.0  4.64e-13
## 3 sexmale       -63.4     22.8       -2.78 8.53e- 3
## 4 age:sexmale     2.05     0.341      6.01 6.81e- 7
head(data.mlr)
## # A tibble: 6 x 3
##     age sex   weight
##   <dbl> <fct>  <dbl>
## 1  62.2 male    199.
## 2  67.1 male    220.
## 3  61.0 male    199.
## 4  75.6 male    263.
## 5  68.0 male    222.
## 6  61.1 male    191.
## design matrix: the interaction column is age * sexmale (age for males,
## 0 for females)
model.matrix(linmod) %>% head
##   (Intercept)      age sexmale age:sexmale
## 1           1 62.24128       1    62.24128
## 2           1 67.10186       1    67.10186
## 3           1 60.98623       1    60.98623
## 4           1 75.57168       1    75.57168
## 5           1 67.97705       1    67.97705
## 6           1 61.07719       1    61.07719
tail(data.mlr)
## # A tibble: 6 x 3
##     age sex    weight
##   <dbl> <fct>   <dbl>
## 1  57.7 female   117.
## 2  63.5 female   141.
## 3  63.6 female   136.
## 4  65.6 female   144.
## 5  72.6 female   162.
## 6  70.6 female   153.
model.matrix(linmod) %>% tail
##    (Intercept)      age sexmale age:sexmale
## 35           1 57.73764       0           0
## 36           1 63.51003       0           0
## 37           1 63.63426       0           0
## 38           1 65.64412       0           0
## 39           1 72.60015       0           0
## 40           1 70.57905       0           0
####################################################################

## 0.0.4 Polynomial and spline models

####################################################################
## nonlinear data
####################################################################

set.seed(1)

## true mean function is a quartic in x (symmetric about x = .5)
data.nonlin = data.frame(x = runif(100, 0, 1)) %>%
  mutate(y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3))

## data plot
ggplot(data.nonlin, aes(y=y, x=x)) + geom_point() + theme_bw()

####################################################################
## fit as a polynomial
####################################################################

## build the raw polynomial basis columns x^2 .. x^4 by hand
data.nonlin = mutate(data.nonlin, 
                     x.pow2 = x^2, x.pow3 = x^3, x.pow4 = x^4)

## fit data with quartic, plot results
quartfit = lm(y ~ x + x.pow2 + x.pow3 + x.pow4, data = data.nonlin)
broom::tidy(quartfit)
## # A tibble: 5 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    -1.21     0.184     -6.59 2.40e- 9
## 2 x             -22.0      2.39      -9.20 8.67e-15
## 3 x.pow2        130.       9.41      13.8  1.92e-24
## 4 x.pow3       -216.      13.9      -15.6  6.71e-28
## 5 x.pow4        108.       6.84      15.9  1.93e-28
mutate(data.nonlin, fitted = fitted(quartfit)) %>%
  ggplot(., aes(y=y, x=x)) + 
  geom_point() + 
  geom_line(aes(y = fitted), color = "red") + 
  theme_bw()

## plot polynomial basis functions (one facet per power of x, including
## the constant pow.0 and the identity x.pow1)
data.nonlin %>%
  dplyr::select(-y) %>%
  mutate(pow.0 = 1, x.pow1 = x) %>%
  ## pivot_longer() supersedes the retired gather(); same long layout
  ## (columns x, power, value), same faceted plot
  pivot_longer(cols = -x, names_to = "power", values_to = "value") %>%
  mutate(power = factor(power)) %>%
  ggplot(., aes(x = x, y = value)) + 
  geom_line() +
  facet_grid(. ~ power, scales = "free") +
  theme_bw()

## fit data with quadratic, plot results
## (underfits: the true curve is quartic)
quadfit = lm(y ~ x + x.pow2, data = data.nonlin)
mutate(data.nonlin, fitted = fitted(quadfit)) %>%
  ggplot(., aes(y=y, x=x)) + 
  geom_point() + 
  geom_line(aes(y = fitted), color = "red") + 
  theme_bw()

## fit data with degree 20 poly, plot results
## (poly() builds an orthogonal polynomial basis; degree 20 overfits)
twentyfit = lm(y ~ poly(x, 20), data = data.nonlin)
mutate(data.nonlin, fitted = fitted(twentyfit)) %>%
  ggplot(., aes(y=y, x=x)) + 
  geom_point() + 
  geom_line(aes(y = fitted), color = "red") + 
  theme_bw()

####################################################################
## fit as a piecewise linear
####################################################################

set.seed(1)

## regenerate the nonlinear data (same seed -> same draws as before)
data.nonlin = data.frame(x = runif(100, 0, 1)) %>%
  mutate(y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3))

## truncated-line basis functions (x - k)+ with knots at .2, .5, .8;
## each one lets the slope change at its knot
data.nonlin = mutate(data.nonlin, 
                     spline_2 = (x - .2) * (x >= .2),
                     spline_5 = (x - .5) * (x >= .5),
                     spline_8 = (x - .8) * (x >= .8))

piecewise.fit = lm(y ~ x + spline_2 + spline_5 + spline_8, data = data.nonlin)
tidy(piecewise.fit)
## # A tibble: 5 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -2.06      0.164   -12.6   6.57e-22
## 2 x              0.285     1.05      0.272 7.86e- 1
## 3 spline_2       7.86      1.36      5.76  1.04e- 7
## 4 spline_5     -16.2       0.783   -20.6   6.78e-37
## 5 spline_8       7.95      1.29      6.18  1.56e- 8
mutate(data.nonlin, fitted = fitted(piecewise.fit)) %>%
  ggplot(., aes(y=y, x=x)) + geom_point() + 
  geom_line(aes(y = fitted), color = "red") + theme_bw()

## plot piecewise basis functions
## (gather() is superseded by tidyr::pivot_longer(); kept as-is here)
data.nonlin %>%
  dplyr::select(-y) %>%
  mutate(spline_int = 1, spline_0 = x) %>%
  gather(key = power, value = value, -x) %>%
  mutate(power = factor(power), power = relevel(power, ref = "spline_int")) %>%
  ggplot(., aes(x = x, y = value)) + geom_line() +
  facet_grid(. ~ power, scales = "free") +
  theme_bw()

####################################################################
## fit as a bspline
####################################################################
## NOTE(review): splines::ns() actually builds a *natural cubic spline*
## basis (represented via B-splines); the heading says "bspline" — a
## raw B-spline basis would be splines::bs().

set.seed(1)

data.nonlin = data.frame(x = runif(100, 0, 1)) %>%
  mutate(y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3))

## attach the 5 spline basis columns (ns with df = 5) and give them
## readable names sp.1 .. sp.5
data.nonlin = data.nonlin %>% bind_cols(., data.frame(ns(.[['x']], df = 5))) %>%
  dplyr::rename(sp.1 = X1, sp.2 = X2, sp.3 = X3, sp.4 = X4, sp.5 = X5)

bspline.fit = lm(y ~ sp.1 + sp.2 + sp.3 + sp.4 + sp.5, data = data.nonlin)
tidy(bspline.fit)
## # A tibble: 6 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -1.95      0.142    -13.7  3.15e-24
## 2 sp.1           2.52      0.163     15.4  1.57e-27
## 3 sp.2           1.91      0.221      8.64 1.40e-13
## 4 sp.3          -0.365     0.158     -2.32 2.25e- 2
## 5 sp.4          -0.415     0.353     -1.17 2.44e- 1
## 6 sp.5           0.432     0.174      2.48 1.50e- 2
mutate(data.nonlin, fitted = fitted(bspline.fit)) %>%
  ggplot(., aes(y=y, x=x)) + geom_point() + 
  geom_line(aes(y = fitted), color = "red") + theme_bw()

## plot bspline basis functions
data.nonlin %>%
  dplyr::select(-y) %>%
  mutate(sp.0 = 1) %>%
  gather(key = power, value = value, -x) %>%
  ggplot(., aes(x = x, y = value)) + geom_line() +
  facet_grid(. ~ power, scales = "free") +
  theme_bw()

####################################################################

## 0.0.5 Identifiability and collinearity

####################################################################
## load MLB data
####################################################################

## download data file
#download.file("http://www.openintro.org/stat/data/mlb11.RData", destfile = "mlb11.RData")
## NOTE(review): load() reads mlb11.RData from the current working
## directory — confirm the file is present before running
load("mlb11.RData")
## load data
## (this reloads heights fresh, discarding the earlier arrange())
data(heights)

mlb11<- mlb11 %>% tbl_df
mlb11
## # A tibble: 30 x 12
##    team   runs at_bats  hits homeruns bat_avg strikeouts stolen_bases  wins
##    <fct> <int>   <int> <int>    <int>   <dbl>      <int>        <int> <int>
##  1 Texa…   855    5659  1599      210   0.283        930          143    96
##  2 Bost…   875    5710  1600      203   0.28        1108          102    90
##  3 Detr…   787    5563  1540      169   0.277       1143           49    95
##  4 Kans…   730    5672  1560      129   0.275       1006          153    71
##  5 St. …   762    5532  1513      162   0.273        978           57    90
##  6 New …   718    5600  1477      108   0.264       1085          130    77
##  7 New …   867    5518  1452      222   0.263       1138          147    97
##  8 Milw…   721    5447  1422      185   0.261       1083           94    96
##  9 Colo…   735    5544  1429      163   0.258       1201          118    73
## 10 Hous…   615    5598  1442       95   0.258       1164          118    56
## # ... with 20 more rows, and 3 more variables: new_onbase <dbl>,
## #   new_slug <dbl>, new_obs <dbl>
####################################################################
## make some plots
####################################################################

## hits against three candidate predictors, arranged side by side
p1 = ggplot(mlb11, aes(x = runs, y = hits)) + geom_point() + 
  labs(x = "Runs", y = "Hits") + theme_bw()
p2 = ggplot(mlb11, aes(x = homeruns, y = hits)) + geom_point() + 
  labs(x = "Homeruns", y = "Hits") + theme_bw()
p3 = ggplot(mlb11, aes(x = stolen_bases, y = hits)) + geom_point() + 
  labs(x = "Stolen Bases", y = "Hits") + theme_bw()

grid.arrange(p1, p2, p3, ncol = 3)

####################################################################
## fit a model
####################################################################

## runs modeled on four offensive statistics
linmod = lm(runs ~ at_bats + hits + homeruns + stolen_bases, data = mlb11)
tidy(linmod)
## # A tibble: 5 x 5
##   term         estimate std.error statistic      p.value
##   <chr>           <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept)   581.      526.         1.10 0.280       
## 2 at_bats        -0.202     0.117     -1.72 0.0971      
## 3 hits            0.697     0.113      6.16 0.00000191  
## 4 homeruns        1.25      0.159      7.87 0.0000000318
## 5 stolen_bases    0.523     0.169      3.10 0.00473
####################################################################
## fit a model "by hand"
####################################################################

## design matrix with an explicit intercept column of 1s
X = cbind(1, mlb11$at_bats, mlb11$hits, mlb11$homeruns, mlb11$stolen_bases)
y = (mlb11$runs)

## OLS normal equations: betaHat = (X'X)^-1 X'y
betaHat = solve(t(X) %*% X) %*% t(X) %*% y
betaHat
##             [,1]
## [1,] 581.2109940
## [2,]  -0.2023278
## [3,]   0.6974143
## [4,]   1.2535062
## [5,]   0.5229741
fitted = X %*% betaHat
## residual SD; denominator 30 - 4 - 1 = n - p - 1 degrees of freedom
sigmaHat = sqrt(t(y - fitted) %*% (y - fitted) / (30-4-1))
sigmaHat
##          [,1]
## [1,] 26.84777
## coefficient covariance matrix: sigma^2 (X'X)^-1
VarBeta = as.numeric(sigmaHat^2) * (solve(t(X) %*% X))
VarBeta
##               [,1]            [,2]          [,3]          [,4]
## [1,] 277103.547951 -60.92179961738 42.6907437568 -1.1735722100
## [2,]    -60.921800   0.01377374118 -0.0108596986  0.0008804808
## [3,]     42.690744  -0.01085969863  0.0128013011 -0.0054905039
## [4,]     -1.173572   0.00088048084 -0.0054905039  0.0253823891
## [5,]     -5.003475   0.00008390983  0.0008247119  0.0017790736
##                [,5]
## [1,] -5.00347547078
## [2,]  0.00008390983
## [3,]  0.00082471190
## [4,]  0.00177907356
## [5,]  0.02843657986
## square roots of the diagonal = the standard errors reported by lm()
sqrt(diag(VarBeta))
## [1] 526.4062575   0.1173616   0.1131428   0.1593185   0.1686315
####################################################################
## regression analyses
####################################################################

linmod = lm(Dheight~Mheight, data = heights)
tidy(linmod)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   29.9      1.62        18.4 5.21e-68
## 2 Mheight        0.542    0.0260      20.9 3.22e-84
## plot daughter height against mother height
ggplot(heights, aes(x = Mheight, y = Dheight)) + geom_point(color = "red") +
  labs(x = "Mother Height", y = "Daughter Height") +
  geom_smooth(method = "lm", se = FALSE, size = 2) + 
  theme_bw()

####################################################################
## mother's height in meters rather than inches
####################################################################

## unit change only: slope scales by 1/0.0254, t statistic and p-value
## are unchanged (compare 21.3 = 0.542 / 0.0254)
heights = mutate(heights, Mheight_m = Mheight * .0254)

linmod = lm(Dheight~Mheight_m, data = heights)
tidy(linmod)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     29.9      1.62      18.4 5.21e-68
## 2 Mheight_m       21.3      1.02      20.9 3.22e-84
## plot daughter height against mother height
ggplot(heights, aes(x = Mheight_m, y = Dheight)) + geom_point(color = "red") +
  labs(x = "Mother Height", y = "Daughter Height") +
  geom_smooth(method = "lm", se = FALSE, size = 2) + 
  theme_bw()

####################################################################
## effects of collinearity
####################################################################

## Mheight_m is an exact linear function of Mheight, so the design matrix
## is rank-deficient: lm() drops the aliased column (the NA row below)
linmod.col = lm(Dheight ~ Mheight + Mheight_m, data = heights)
summary(linmod.col)
## 
## Call:
## lm(formula = Dheight ~ Mheight + Mheight_m, data = heights)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.397 -1.529  0.036  1.492  9.053 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 29.91744    1.62247   18.44 <0.0000000000000002 ***
## Mheight      0.54175    0.02596   20.87 <0.0000000000000002 ***
## Mheight_m         NA         NA      NA                  NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.266 on 1373 degrees of freedom
## Multiple R-squared:  0.2408, Adjusted R-squared:  0.2402 
## F-statistic: 435.5 on 1 and 1373 DF,  p-value: < 0.00000000000000022
## X'X is singular here, so the inversion below would error (left commented)
X = as.matrix(cbind(1, dplyr::select(heights, Mheight, Mheight_m)))
#solve(t(X) %*% X)

## some measurement error on the meters
## NOTE(review): no set.seed() before this rnorm() draw, so the exact
## numbers below are not reproducible run-to-run — consider seeding
heights = mutate(heights, Mheight_m = Mheight_m + rnorm(1375, mean = 0, sd = .01))
summarize(heights, cor = cor(Mheight, Mheight_m))
##         cor
## 1 0.9851668
## noise breaks the exact dependence, so X'X is now invertible
X = as.matrix(cbind(1, dplyr::select(heights, Mheight, Mheight_m)))
solve(t(X) %*% X)
##                      1      Mheight   Mheight_m
## 1          0.512689796 -0.007347167 -0.03348367
## Mheight   -0.007347167  0.004456172 -0.17082049
## Mheight_m -0.033483667 -0.170820488  6.74680836
ggplot(heights, aes(x = Mheight, y=Mheight_m)) + geom_point() + 
  labs(x = "Height in inches", y = "Height in meters") + 
  theme_bw()

## near-collinearity inflates the standard errors dramatically
## (compare Mheight's SE of 0.151 here to 0.026 in the single-predictor fit)
linmod.me = lm(Dheight ~ Mheight + Mheight_m, data = heights)
tidy(linmod.me)
## # A tibble: 3 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   29.9       1.62     18.4   6.89e-68
## 2 Mheight        0.453     0.151     2.99  2.80e- 3
## 3 Mheight_m      3.50      5.89      0.595 5.52e- 1
####################################################################

0.0.6 Inference for MLR

####################################################################
## fit a model
####################################################################

## NOTE(review): mlb11 is loaded via load("mlb11.RData") only in a later
## section of this document -- presumably it was also loaded in an
## earlier chunk; verify the .RData file is available before this point.
## full model: team runs modeled on four offensive statistics
linmod = lm(runs ~ at_bats + hits + homeruns + stolen_bases, data = mlb11)
summary(linmod)
## 
## Call:
## lm(formula = runs ~ at_bats + hits + homeruns + stolen_bases, 
##     data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.868 -22.828   5.927  19.801  36.379 
## 
## Coefficients:
##              Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)  581.2110   526.4063   1.104      0.28006    
## at_bats       -0.2023     0.1174  -1.724      0.09706 .  
## hits           0.6974     0.1131   6.164 0.0000019111 ***
## homeruns       1.2535     0.1593   7.868 0.0000000318 ***
## stolen_bases   0.5230     0.1686   3.101      0.00473 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.85 on 25 degrees of freedom
## Multiple R-squared:  0.9087, Adjusted R-squared:  0.894 
## F-statistic: 62.17 on 4 and 25 DF,  p-value: 0.00000000000126
## compare to a null model with two predictors: partial F test for the
## joint contribution of at_bats and stolen_bases (2 df)
linmod.null1 = lm(runs ~ hits + homeruns, data = mlb11)
anova(linmod.null1, linmod)
## Analysis of Variance Table
## 
## Model 1: runs ~ hits + homeruns
## Model 2: runs ~ at_bats + hits + homeruns + stolen_bases
##   Res.Df   RSS Df Sum of Sq      F   Pr(>F)   
## 1     27 27128                                
## 2     25 18020  2    9107.8 6.3178 0.006015 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## compare to a null model with everything but stolen_bases: a 1-df test,
## equivalent to the t test for stolen_bases in the full-model summary
linmod.null2 = lm(runs ~ at_bats + hits + homeruns, data = mlb11)
anova(linmod.null2, linmod)
## Analysis of Variance Table
## 
## Model 1: runs ~ at_bats + hits + homeruns
## Model 2: runs ~ at_bats + hits + homeruns + stolen_bases
##   Res.Df   RSS Df Sum of Sq     F   Pr(>F)   
## 1     26 24953                               
## 2     25 18020  1    6932.7 9.618 0.004728 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## compare to a null model with only the intercept: reproduces the
## overall F statistic reported in summary(linmod)
linmod.null3 = lm(runs ~ 1, data = mlb11)
anova(linmod.null3, linmod)
## Analysis of Variance Table
## 
## Model 1: runs ~ 1
## Model 2: runs ~ at_bats + hits + homeruns + stolen_bases
##   Res.Df    RSS Df Sum of Sq      F           Pr(>F)    
## 1     29 197281                                         
## 2     25  18020  4    179261 62.174 0.00000000000126 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
####################################################################
## simulations looking at LSEs
####################################################################

## fix simulation elements
nrep = 10000
beta0 = 0
beta1 = 1
n = c(10, 100, 1000)

## Simulate nrep least-squares slope estimates at each sample size in n.
## error_gen(m) must draw m iid mean-zero errors. The covariate x is
## drawn once per sample size, so the design is fixed across
## replications -- matching the original code's RNG call order exactly.
simulate_LSEs = function(error_gen) {
  LSEs = data.frame(n10 = rep(NA, nrep),
                    n100 = rep(NA, nrep),
                    n1000 = rep(NA, nrep))
  for (samp.size in seq_along(n)) {
    x = rnorm(n[samp.size], 0, 1)
    for (i in seq_len(nrep)) {
      y = beta0 + beta1 * x + error_gen(n[samp.size])
      LSEs[i, samp.size] = lm(y ~ x)$coef[2]
    }
  }
  LSEs
}

## simulate data; compute and save LSEs under normal errors and under a
## mean-zero shifted/scaled Bernoulli error: 10/3 * Bern(.1) - 1/3
NormLSEs = simulate_LSEs(function(m) rnorm(m, 0, 1))
NonNormLSEs = simulate_LSEs(function(m) 10/3 * (rbinom(m, 1, .1)) - 1/3)

## Reshape to long format and add sqrt(n) * (betahat - beta1): by the
## CLT for least squares, this normalized quantity should look normal
## for large n even when the raw errors are not.
normalize_LSEs = function(LSEs) {
  gather(LSEs, key = SampSize, value = LSE) %>%
    mutate(Normalized = sqrt(rep(c(10, 100, 1000), each = nrep)) * (LSE - beta1))
}

## Plot the raw and normalized sampling distributions side by side.
plot_LSEs = function(LSEs) {
  p1 = ggplot(LSEs, aes(LSE, fill = SampSize)) +
    geom_density(alpha = .2) +
    labs(title = "LSEs") +
    theme_bw()
  p2 = ggplot(LSEs, aes(Normalized, fill = SampSize)) +
    geom_density(alpha = .2) +
    labs(title = "Normalized LSEs") +
    theme_bw()
  grid.arrange(p1, p2, ncol = 2)
}

## arrange and plot results of simulation for normal errors
NormLSEs = normalize_LSEs(NormLSEs)
plot_LSEs(NormLSEs)

## arrange and plot results of simulation for non-normal errors
NonNormLSEs = normalize_LSEs(NonNormLSEs)
plot_LSEs(NonNormLSEs)

####################################################################
## plot of FWER
####################################################################

## family-wise error rate for k independent tests at level alpha:
## P(at least one false rejection) = 1 - (1 - alpha)^k
alpha = .05
k = 1:100
fwer = 1 - (1 - alpha)^k
plot.dat = data.frame(k = k, fwer = fwer)

## the FWER climbs rapidly toward the reference line at 1 as k grows
ggplot(plot.dat, aes(x = k, y = fwer)) + 
  geom_path() + 
  geom_hline(yintercept = 1)

####################################################################
## nonlinear example
####################################################################

set.seed(1)

## simulate 100 points from a quartic mean function with N(0, .3^2) noise
data.nonlin = data.frame(x = runif(100, 0, 1))
data.nonlin$y = -30 * (data.nonlin$x - .5)^2 +
  100 * (data.nonlin$x - .5)^4 + rnorm(100, 0, .3)

## data plot
ggplot(data.nonlin, aes(y = y, x = x)) + geom_point() + theme_bw()

## truncated-line ("broken stick") basis with knots every .05 from .1 to
## .9; column names drop the leading "0." of each knot, e.g. spline_15
## for the knot at .15
knots = seq(.1, .9, by = 0.05)
PW.basis = sapply(knots, function(u) (data.nonlin$x - u) * (data.nonlin$x >= u))
colnames(PW.basis) = paste0("spline_", substr(as.character(knots), 3, 5))
PW.basis = as.data.frame(PW.basis)
data.nonlin = bind_cols(data.nonlin, PW.basis)

## fit three models: a straight line (underfit), a piecewise fit with
## three well-placed knots, and a piecewise fit using every knot (overfit)
piecewise.underfit = lm(y ~ x, data = data.nonlin)
tidy(piecewise.underfit)
## # A tibble: 2 x 5
##   term        estimate std.error statistic     p.value
##   <chr>          <dbl>     <dbl>     <dbl>       <dbl>
## 1 (Intercept)   -1.08      0.189    -5.68  0.000000139
## 2 x             -0.251     0.325    -0.772 0.442
piecewise.fit = lm(y ~ x + spline_15 + spline_5 + spline_9, data = data.nonlin)
tidy(piecewise.fit)
## # A tibble: 5 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    -1.78     0.180     -9.91 2.61e-16
## 2 x              -3.51     1.49      -2.36 2.06e- 2
## 3 spline_15      11.1      1.72       6.46 4.39e- 9
## 4 spline_5      -14.8      0.584    -25.3  5.95e-44
## 5 spline_9       20.6      2.76       7.45 4.24e-11
piecewise.overfit = lm(y ~ x + spline_1 + spline_15 + spline_2 + 
                         spline_25 + spline_3 + spline_35 + spline_4 + 
                         spline_45 + spline_5 + spline_55 + spline_6 + 
                         spline_65 + spline_7 + spline_75 + spline_8 + 
                         spline_85 + spline_9, data = data.nonlin)
tidy(piecewise.overfit)
## # A tibble: 19 x 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)   -1.78      0.241   -7.37   1.29e-10
##  2 x             -3.36      3.43    -0.981  3.29e- 1
##  3 spline_1      -5.25      9.17    -0.572  5.69e- 1
##  4 spline_15     23.0      12.5      1.84   6.87e- 2
##  5 spline_2     -12.1       9.51    -1.28   2.06e- 1
##  6 spline_25     10.6       9.73     1.09   2.77e- 1
##  7 spline_3      -7.99     12.8     -0.626  5.33e- 1
##  8 spline_35      8.50     10.9      0.776  4.40e- 1
##  9 spline_4      -9.39      8.38    -1.12   2.66e- 1
## 10 spline_45     -4.32      9.84    -0.439  6.62e- 1
## 11 spline_5       6.68     10.7      0.621  5.36e- 1
## 12 spline_55    -24.5      11.8     -2.07   4.20e- 2
## 13 spline_6      17.7      10.6      1.67   9.86e- 2
## 14 spline_65    -12.3       8.33    -1.48   1.43e- 1
## 15 spline_7       5.57      9.93     0.561  5.76e- 1
## 16 spline_75      1.12     11.4      0.0977 9.22e- 1
## 17 spline_8       0.944    10.2      0.0928 9.26e- 1
## 18 spline_85      0.843     9.91     0.0851 9.32e- 1
## 19 spline_9      14.9       7.83     1.90   6.04e- 2
## add fitted values to data for plotting
data.nonlin = mutate(data.nonlin, 
                     underfit = fitted(piecewise.underfit),
                     fit = fitted(piecewise.fit),
                     overfit = fitted(piecewise.overfit))


## F test comparing the null straight-line fit against the alternative
## piecewise fit with knots at .15, .5, .9: strong evidence of nonlinearity
anova(piecewise.underfit, piecewise.fit)
## Analysis of Variance Table
## 
## Model 1: y ~ x
## Model 2: y ~ x + spline_15 + spline_5 + spline_9
##   Res.Df    RSS Df Sum of Sq     F                Pr(>F)    
## 1     98 73.444                                             
## 2     95  8.240  3    65.205 250.6 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NOTE(review): both fitted curves below are drawn in "red", so they
## cannot be told apart -- one was likely meant to be a different color
ggplot(data.nonlin, aes(y=y, x=x)) + geom_point() + 
  geom_line(aes(y = underfit), color = "red") + 
  geom_line(aes(y = fit), color = "red") + 
  theme_bw()

## F test comparing the null three-knot piecewise fit against the
## alternative fit using all seventeen knots: no evidence the extra
## knots are needed
anova(piecewise.fit, piecewise.overfit)
## Analysis of Variance Table
## 
## Model 1: y ~ x + spline_15 + spline_5 + spline_9
## Model 2: y ~ x + spline_1 + spline_15 + spline_2 + spline_25 + spline_3 + 
##     spline_35 + spline_4 + spline_45 + spline_5 + spline_55 + 
##     spline_6 + spline_65 + spline_7 + spline_75 + spline_8 + 
##     spline_85 + spline_9
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1     95 8.2395                          
## 2     81 6.7862 14    1.4533 1.239 0.2645
## NOTE(review): same color issue here -- both lines are "red"
ggplot(data.nonlin, aes(y=y, x=x)) + geom_point() + 
  geom_line(aes(y = fit), color = "red") + 
  geom_line(aes(y = overfit), color = "red") + 
  theme_bw()

## anova table for all three nested models at once
anova(piecewise.underfit, piecewise.fit, piecewise.overfit)
## Analysis of Variance Table
## 
## Model 1: y ~ x
## Model 2: y ~ x + spline_15 + spline_5 + spline_9
## Model 3: y ~ x + spline_1 + spline_15 + spline_2 + spline_25 + spline_3 + 
##     spline_35 + spline_4 + spline_45 + spline_5 + spline_55 + 
##     spline_6 + spline_65 + spline_7 + spline_75 + spline_8 + 
##     spline_85 + spline_9
##   Res.Df    RSS Df Sum of Sq       F              Pr(>F)    
## 1     98 73.444                                             
## 2     95  8.240  3    65.205 259.427 <0.0000000000000002 ***
## 3     81  6.786 14     1.453   1.239              0.2645    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
####################################################################
## mlb data example of LRTs
####################################################################

## load the mlb11 data (NOTE(review): mlb11 was already referenced in an
## earlier section -- confirm whether this load is redundant there)
load("mlb11.RData")
head(mlb11)
##                  team runs at_bats hits homeruns bat_avg strikeouts
## 1       Texas Rangers  855    5659 1599      210   0.283        930
## 2      Boston Red Sox  875    5710 1600      203   0.280       1108
## 3      Detroit Tigers  787    5563 1540      169   0.277       1143
## 4  Kansas City Royals  730    5672 1560      129   0.275       1006
## 5 St. Louis Cardinals  762    5532 1513      162   0.273        978
## 6       New York Mets  718    5600 1477      108   0.264       1085
##   stolen_bases wins new_onbase new_slug new_obs
## 1          143   96      0.340    0.460   0.800
## 2          102   90      0.349    0.461   0.810
## 3           49   95      0.340    0.434   0.773
## 4          153   71      0.329    0.415   0.744
## 5           57   90      0.341    0.425   0.766
## 6          130   77      0.335    0.391   0.725
## fit the full model
linmod = lm(runs ~ at_bats + hits + homeruns + stolen_bases, data = mlb11)

## compare to a null model with two predictors via a partial F test
linmod.null1 = lm(runs ~ hits + homeruns, data = mlb11)
anova(linmod.null1, linmod)
## Analysis of Variance Table
## 
## Model 1: runs ~ hits + homeruns
## Model 2: runs ~ at_bats + hits + homeruns + stolen_bases
##   Res.Df   RSS Df Sum of Sq      F   Pr(>F)   
## 1     27 27128                                
## 2     25 18020  2    9107.8 6.3178 0.006015 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## LRT for the same comparison: -2 * (logLik null - logLik full) is
## referred to a chi-squared with 2 df (two parameters dropped);
## the p-value is close to, but not identical to, the F-test result
Delta = -2*(logLik(linmod.null1) - logLik(linmod))
1-pchisq(Delta, 2)
## 'log Lik.' 0.002163305 (df=4)
####################################################################
## illustration of confidence ellipses
####################################################################

library(ellipse)
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:car':
## 
##     ellipse
## The following object is masked from 'package:graphics':
## 
##     pairs
## joint confidence region for coefficients 2 and 3 of linmod
## (at_bats and hits), plus their point estimates for plotting
CI.ellipse = as.data.frame(ellipse(linmod,c(2,3)))
est = as.data.frame(t(as.matrix(coef(linmod)[2:3])))

## plot the joint confidence region; the dashed lines are the marginal
## 95% intervals (confint() rows supply both bounds at once), showing
## that the rectangle and the ellipse cover different parameter sets
ggplot(CI.ellipse, aes(x = at_bats, y = hits)) + geom_path() +
  geom_hline(yintercept = confint(linmod)[3,], linetype = 2) + 
  geom_vline(xintercept = confint(linmod)[2,], linetype = 2) +
  geom_point(data = est, size = 4)

####################################################################
## polynomial example of pointwise intervals
####################################################################

## generate data from a quartic mean function
set.seed(1)

x = runif(100, 0, 1)
x = sort(x)
y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3)
plot.dat = as.data.frame(cbind(y,x))

## fit data with a quartic (orthogonal polynomial basis)
quartfit = lm(y~poly(x,4))
plot.dat$quart_fit = quartfit$fitted.values

## plot of fit
plot = ggplot(plot.dat, aes(y=y, x=x)) + geom_point(size = 2) + 
  geom_line(aes(y=quart_fit, x = x), color = "red", size = 1) 
plot

## use quartfit to find the variance matrix of the estimated coefs:
## Var(betahat) = sigma^2 * (X'X)^{-1}
VarMat = (summary(quartfit)$sigma)^2 * summary(quartfit)$cov.unscaled

## design matrix on the polynomial basis (x is already sorted above, so
## the second sort the original code did here was a no-op and is removed)
x.poly = cbind(1, poly(x, 4))

## ci at a single new x: fitted value x0'betahat with standard error
## sqrt(x0' Var(betahat) x0). drop() collapses the 1x1 matrix products
## to scalars so the +/- 2*se arithmetic no longer triggers R's
## deprecated array-vector recycling warning.
fitted = drop(x.poly[50,] %*% quartfit$coef)
se = sqrt(drop(x.poly[50,] %*% VarMat %*% x.poly[50,]))
ybound = fitted + c(-2, 2) * se
plot + geom_segment(aes(x = x[50], y = ybound[1], xend = x[50], yend = ybound[2]), color = "blue", size = 1.1)

## ci's at all new x's: the diagonal of X V X' gives the pointwise
## variances of the fitted values
fitted.overall = drop(x.poly %*% quartfit$coef)
se.fitted = sqrt(diag(x.poly %*% VarMat %*% t(x.poly)))

plot.dat$UB = fitted.overall + 2 * se.fitted
plot.dat$LB = fitted.overall - 2 * se.fitted

plot + geom_segment(aes(x = x[50], y = ybound[1], xend = x[50], yend = ybound[2]), color = "blue", size = 1.1) +
  geom_line(data = plot.dat, aes(y=UB, x = x), color = "blue", linetype = 2) +
  geom_line(data = plot.dat, aes(y=LB, x = x), color = "blue", linetype = 2) 

####################################################################
## mother/daughter height to show difference in CI and PI
####################################################################

library(alr3)
data(heights)

x = heights$Mheight
y = heights$Dheight

linmod = lm(y~x)

## get pointwise CIs for the mean response.
## BUG FIX: Var(betahat) = sigma^2 * (X'X)^{-1}; the original code used
## sigma (unsquared), understating the CI width -- compare the quartic
## example above and the PI computation below, both of which square sigma.
x.design = cbind(1, x)
VarMat = summary(linmod)$sigma^2 * summary(linmod)$cov.unscaled

fitted = x.design %*% linmod$coef
se.fitted = sqrt(diag(x.design %*% VarMat %*% t(x.design)))

UB = fitted + 2 * se.fitted
LB = fitted - 2 * se.fitted
plot.dat = as.data.frame(cbind(x, y, fitted, UB, LB))

plot = ggplot(plot.dat, aes(x = x, y = y)) + geom_point(color = "red") +
  geom_line(aes(x = x, y = fitted), color = "blue") + 
  geom_line(aes(x = x, y = UB), color = "blue", linetype = 2) + 
  geom_line(aes(x = x, y = LB), color = "blue", linetype = 2)


## plot the prediction interval for one subject: the PI variance adds
## the residual variance sigma^2 to the variance of the fitted mean (the
## "1 +" term), so the PI is always wider than the pointwise CI.
## drop() collapses the 1x1 matrix product to a scalar, avoiding R's
## deprecated array-vector recycling in the +/- 2*se step.
se.pred = sqrt(summary(linmod)$sigma^2 * (1 + drop(x.design[200,] %*% summary(linmod)$cov.unscaled %*% x.design[200,])))
ybound = fitted[200] + c(-2, 2) * se.pred
plot + geom_segment(aes(x = x[200], y = ybound[1], 
                        xend = x[200], yend = ybound[2]), 
                    color = "blue", size = 1.1)

####################################################################

0.0.7 Resampling methods

####################################################################
## intuition for bootstrapping
####################################################################

n = 200
set.seed(5)

## generate a single sample with heteroscedastic errors: the error sd
## is proportional to x, so OLS standard errors are not trustworthy
data.noncst = data.frame(x = rexp(n, rate = .1)) %>%
  mutate(y = 2 + 3*x + x * rnorm(n))

ggplot(data.noncst, aes(x=x, y=y)) + geom_point() + 
  geom_smooth(method = "lm", fill = "lightblue") + theme_bw()

## generate many samples from the true distribution to visualize the
## true sampling variability of the fitted line
p <- list()
for(i in 1:16){
  data.noncst.cur = data.frame(x = rexp(n, rate = .1)) %>%
    mutate(y = 2 + 3*x + x * rnorm(n))
  p[[i]] <- ggplot(data.noncst.cur, aes(x=x, y=y)) + geom_point() + 
    geom_smooth(method = "lm", fill = "lightblue") + theme_bw()
}
do.call(grid.arrange, p)

## generate many samples from the true distribution and save the fitted
## coefficients from each -- this is the true sampling distribution,
## available only because we know the data-generating mechanism
beta.hat = data.frame(
  b0 = rep(NA, 1000),
  b1 = rep(NA, 1000))

for(i in 1:1000){
  data.noncst.cur = data.frame(x = rexp(n, rate = .1)) %>%
    mutate(y = 2 + 3*x + x * rnorm(n))
  beta.hat[i,] = coef(lm(y~x, data = data.noncst.cur))
}

## overlay all 1000 fitted lines on the original sample
ggplot(data.noncst, aes(x=x, y=y)) + geom_point() + 
  geom_abline(data = beta.hat, aes(intercept = b0, slope = b1), alpha = .1) + 
  geom_smooth(method = "lm", fill = "lightblue") + theme_bw()

## implement the bootstrap on the original dataset: resample n rows with
## replacement (sample_frac with size = 1) and refit, mimicking the true
## sampling distribution without knowing the data-generating mechanism
beta.hat = data.frame(
  b0 = rep(NA, 1000),
  b1 = rep(NA, 1000))

for(i in 1:1000){
  data.cur = sample_frac(data.noncst, size = 1, replace = TRUE)
  beta.hat[i,] = coef(lm(y~x, data.cur))
}  

## overlay the 1000 bootstrap lines -- compare with the previous plot
ggplot(data.noncst, aes(x=x, y=y)) + geom_point() + 
  geom_abline(data = beta.hat, aes(intercept = b0, slope = b1), alpha = .1) + 
  geom_smooth(method = "lm", fill = "lightblue") + theme_bw()

####################################################################
## prestige example for the bootstrap
####################################################################

library(car)

## fit the model
linmod = lm(prestige ~ income, data = Prestige)

## plot data and regression fit
ggplot(Prestige, aes(x = income, y = prestige)) + geom_point() + 
  geom_smooth(method = "lm") + theme_bw()

## define a vector for the bootstrapped estimates
betaHatBS = data.frame(b1.est = rep(NA, 10000))

## use a loop to do the bootstrap: resample rows with replacement and
## save the slope each time
## NOTE(review): no set.seed here, so results differ run to run
for(i in 1:10000){
  data.cur = sample_frac(Prestige, size = 1, replace = TRUE)
  betaHatBS$b1.est[i] = lm(prestige ~ income, data = data.cur)$coef[2]
}

## normal reference distribution centered at the OLS slope with the
## model-based standard error, for comparison with the bootstrap
betaHatBS$b1.null = rnorm(10000, linmod$coef[2], summary(linmod)$coef[2,2])

## make a plot of bootstrap (blue) and normal-based (red) distributions
ggplot(betaHatBS, aes(x = b1.est)) + geom_density(fill = "blue", alpha = .3) + 
  geom_density(aes(x = b1.null), fill = "red", alpha = .3) + theme_bw()

####################################################################
## permutation testing
####################################################################

n = 20
set.seed(1)

## generate a single sample (true slope = 1, heteroscedastic errors)
data.noncst = data.frame(x = rexp(n, rate = .1)) %>%
  mutate(y = 2 + 1 * x + x * rnorm(n))

ggplot(data.noncst, aes(x=x, y=y)) + geom_point() + 
  geom_smooth(method = "lm", fill = "lightblue") + theme_bw()

## generate a few permuted datasets: shuffling x breaks any association
## with y, giving draws from the null of no relationship
p <- list()
for(i in 1:16){
  data.noncst.cur = mutate(data.noncst, x = sample(x, length(x), replace = FALSE))
  p[[i]] <- ggplot(data.noncst.cur, aes(x=x, y=y)) + geom_point() + 
    geom_smooth(method = "lm", fill = "lightblue") + theme_bw()
}
do.call(grid.arrange, p)

## do enough permutations to test: build the null distribution of the
## slope by refitting on each permuted dataset
obs.coef = coef(lm(y ~ x, data = data.noncst))[2]

b1 = data.frame(b1.null = rep(NA, 10000))
for(i in 1:10000){
  data.noncst.cur = mutate(data.noncst, x = sample(x, length(x), replace = FALSE))
  b1$b1.null[i] = coef(lm(y ~ x, data = data.noncst.cur))[2]
}

ggplot(b1, aes(x = b1.null)) + geom_density(fill = "blue", alpha = .3) +
  geom_vline(xintercept = obs.coef, col = "red") + theme_bw()

## two-sided p-value by doubling the upper tail.
## NOTE(review): this is only valid because obs.coef sits in the upper
## tail here; a sign-agnostic version (e.g. comparing absolute values)
## would be safer. Also b1 is a one-column data frame, so the comparison
## operates on the whole frame -- b1$b1.null would be clearer.
2 * mean(b1>obs.coef)
## [1] 0.0074
####################################################################
## cross validation
####################################################################

## generate data (same quartic-mean setup as the nonlinear example)
set.seed(1)

data.nonlin = data.frame(x = runif(100, 0, 1)) %>%
  mutate(y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3))

## code for generating an overly-rich piecewise spline basis
PW.basis = sapply(seq(.1, .9, by = 0.05), function(u) (data.nonlin$x - u) * (data.nonlin$x >= u))
colnames(PW.basis) = paste0("spline_", substr(as.character(seq(.1, .9, by = 0.05)), 3, 5))
PW.basis = as.data.frame(PW.basis)

data.nonlin = bind_cols(data.nonlin, PW.basis)

## holds the test-set MSE for five candidate models across 100 random
## train/test splits (Monte Carlo cross-validation)
MSEs = data.frame(
  model1 = rep(NA, 100),
  model2 = rep(NA, 100),
  model3 = rep(NA, 100),
  model4 = rep(NA, 100),
  model5 = rep(NA, 100)
)

for(i in 1:100){

  ## seed per iteration makes each 80/20 split reproducible
  set.seed(i)
  data.nonlin = mutate(data.nonlin, 
                       cv_group = sample(1:100, 100, replace = FALSE) <= 80,
                       cv_group = factor(cv_group, levels = c(TRUE, FALSE), 
                                         labels = c("train", "test")))

  data.train = filter(data.nonlin, cv_group == "train")
  data.test = filter(data.nonlin, cv_group == "test")
  
  ## model 1: straight line (underfit)
  fit.1 = lm(y ~ x, data = data.train)
  MSEs[i,1] = mean((data.test$y - predict(fit.1, newdata = data.test))^2)

  ## model 2: one knot at .5
  fit.2 = lm(y ~ x + spline_5, data = data.train)
  MSEs[i,2] = mean((data.test$y - predict(fit.2, newdata = data.test))^2)
  
  ## model 3: three evenly spaced knots
  fit.3 = lm(y ~ x + spline_2 + spline_5 + spline_8, data = data.train)
  MSEs[i,3] = mean((data.test$y - predict(fit.3, newdata = data.test))^2)
  
  ## model 4: three well-placed knots
  fit.4 = lm(y ~ x + spline_15 + spline_5 + spline_9, data = data.train)
  MSEs[i,4] = mean((data.test$y - predict(fit.4, newdata = data.test))^2)
  
  ## model 5: all seventeen knots (overfit)
  fit.5 = lm(y ~ x + spline_1 + spline_15 + spline_2 + spline_25 + spline_3 + spline_35 + 
               spline_4 + spline_45 + spline_5 + spline_55 + spline_6 + spline_65 + 
               spline_7 + spline_75 + spline_8 + spline_85 + spline_9, 
               data = data.train)
  MSEs[i,5] = mean((data.test$y - predict(fit.5, newdata = data.test))^2)

}

## MSEs from a single split -- noisy, ranks the models unreliably
slice(MSEs, 1) %>% gather(key = fit, value = MSE, model1:model5) %>%
  ggplot(aes(x = fit, y = MSE)) + geom_point() + theme_bw()

## distribution of MSEs across all 100 splits; the overfit model5 is
## highly variable (one extreme value is clipped by the ylim)
gather(MSEs, key = fit, value = MSE, model1:model5) %>%
  ggplot(aes(x = fit, y = MSE)) + geom_violin(alpha = .4, fill = "blue") +
  theme_bw() + ylim(0, 1.5)
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).

## average test MSE: model4 (knots at .15, .5, .9) wins
apply(MSEs, 2, mean)
##     model1     model2     model3     model4     model5 
## 0.77698612 0.19078440 0.12511054 0.09944018 1.94061266
####################################################################

0.0.8 Regression diagnostics & Model checking

####################################################################
## fit three models with differing error structures; create
## residual and qqplots for each
####################################################################

set.seed(1)

## simulate one covariate and three outcomes: constant error variance
## (y1), error sd proportional to x (y2), and a quadratic mean with
## constant variance (y3)
data.example = data.frame(x = runif(200, 0, 10))
data.example = data.example %>%
  mutate(y1 = 1 + 2 * x + rnorm(200),
         y2 = 1 + 2 * x + rnorm(200) * x,
         y3 = 1 + (x - 4)^2 + rnorm(200, mean = 0, sd = 5)) %>%
  arrange(x)

## fit a simple linear model to each outcome
fit1 = lm(y1 ~ x, data = data.example)
fit2 = lm(y2 ~ x, data = data.example)
fit3 = lm(y3 ~ x, data = data.example)

## store fitted values and residuals alongside the data
data.example = mutate(data.example,
                      fitted1 = fitted(fit1), resid1 = resid(fit1),
                      fitted2 = fitted(fit2), resid2 = resid(fit2),
                      fitted3 = fitted(fit3), resid3 = resid(fit3))

## raw data: each outcome against x
p = list(
  ggplot(data.example, aes(x = x, y = y1)) + geom_point() + theme_bw(),
  ggplot(data.example, aes(x = x, y = y2)) + geom_point() + theme_bw(),
  ggplot(data.example, aes(x = x, y = y3)) + geom_point() + theme_bw()
)
grid.arrange(p[[1]], p[[2]], p[[3]], nrow = 1, ncol = 3)

## residual-vs-fitted plots: since y2's errors scale with x, expect a
## fan shape; since y3's mean is quadratic but the model is linear,
## expect curvature
p = list(
  ggplot(data.example, aes(x = fitted1, y = resid1)) + geom_point() + theme_bw(),
  ggplot(data.example, aes(x = fitted2, y = resid2)) + geom_point() + theme_bw(),
  ggplot(data.example, aes(x = fitted3, y = resid3)) + geom_point() + theme_bw()
)
grid.arrange(p[[1]], p[[2]], p[[3]], nrow = 1, ncol = 3)

## normal qq-plots of the residuals
p = list(
  ggplot(data.example, aes(sample = resid1)) + stat_qq() + theme_bw(),
  ggplot(data.example, aes(sample = resid2)) + stat_qq() + theme_bw(),
  ggplot(data.example, aes(sample = resid3)) + stat_qq() + theme_bw()
)
grid.arrange(p[[1]], p[[2]], p[[3]], nrow = 1, ncol = 3)

####################################################################
## leverage plots 1
####################################################################

set.seed(1)

## simulate a linear relationship, then move observation 1 to
## (12.5, 10): far outside the x range (high leverage) and far from the
## line (outlying), so it is influential
data.example = data.frame(x = runif(100, 0, 10)) %>%
  mutate(y = 1+2*x + rnorm(100)) %>%
  mutate(x = replace(x, 1, 12.5), 
         y = replace(y, 1, 10))

fit = lm(y~x, data = data.example)

ggplot(data.example, aes(x=x, y=y)) + geom_point() + 
  stat_smooth(method = "lm", se = FALSE) + theme_bw()

## plot hat values against observation index; the horizontal line is a
## 2p/n rule-of-thumb cutoff.
## NOTE(review): n is 100 here so the usual cutoff would be 2*2/100;
## confirm whether the /101 denominator is intentional. Also the
## ylim(0, 0.09) may clip the modified point's (large) hat value --
## verify it appears on the plot.
data.example = mutate(data.example,
                      hatvals = hatvalues(fit),
                      index = seq_along(hatvals))

ggplot(data.example, aes(x=index, y=hatvals)) + geom_point() + 
  ylim(0,0.09) + geom_hline(yintercept = 2*2/101) + 
  theme_bw()

####################################################################
## leverage plots 2
####################################################################

set.seed(1)

## simulate a linear relationship, then move observation 1 to (5.5, 3):
## near the middle of the x range but far below the fitted line
data.example = data.frame(x = runif(100, 0, 10))
data.example$y = 1 + 2 * data.example$x + rnorm(100)
data.example$x[1] = 5.5
data.example$y[1] = 3

fit = lm(y ~ x, data = data.example)

## scatterplot with the least-squares line
ggplot(data.example, aes(x = x, y = y)) + geom_point() +
  stat_smooth(method = "lm", se = FALSE) + theme_bw()

## plot hat values against observation index: an outlier in y alone,
## with a typical x value, earns little leverage
data.example$hatvals = hatvalues(fit)
data.example$index = seq_along(data.example$hatvals)

ggplot(data.example, aes(x = index, y = hatvals)) + geom_point() +
  ylim(0, 0.09) + geom_hline(yintercept = 2 * 2 / 101) +
  theme_bw()

####################################################################
## leverage plots 3
####################################################################

set.seed(1)

## null relationship (constant mean), then move observation 1 to
## (10, 20): extreme in both x and y, so both high leverage and
## influential
data.example = data.frame(x = rnorm(100, 1, .5)) %>%
  mutate(y = 3 + rnorm(100)) %>%
  mutate(x = replace(x, 1, 10), 
         y = replace(y, 1, 20))

fit = lm(y~x, data = data.example)

ggplot(data.example, aes(x=x, y=y)) + geom_point() + 
  stat_smooth(method = "lm", se = FALSE) + theme_bw()

## plot hat values; no ylim this time, so the modified point's very
## large hat value is visible above the 2p/n reference line
## (NOTE(review): /101 vs /100 -- see the earlier leverage examples)
data.example = mutate(data.example,
                      hatvals = hatvalues(fit),
                      index = seq_along(hatvals))

ggplot(data.example, aes(x=index, y=hatvals)) + geom_point() + 
  geom_hline(yintercept = 2*2/101) + 
  theme_bw()

####################################################################
## cook's distance
####################################################################

## plot Cook's distances for the leverage-3 fit (the comment in the
## original said "hat values" -- copy-paste error; this is cooks.distance)
data.example = mutate(data.example,
                      cooks = cooks.distance(fit),
                      index = seq_along(cooks))

## NOTE(review): the reference line 2*2/101 is the leverage rule of
## thumb reused from the hat-value plots, not a conventional Cook's
## distance cutoff (commonly 4/n or 1) -- confirm intent.
ggplot(data.example, aes(x=index, y=cooks)) + geom_point() + 
  geom_hline(yintercept = 2*2/101) + 
  theme_bw()

####################################################################

0.0.9 Model selection

####################################################################
## load the life expectancy data and run backward selection
####################################################################

## manual backward elimination: at each step drop the predictor with the
## largest p-value and track AIC; stop when AIC stops decreasing
data(state)
statedata = data.frame(state.x77,row.names=state.abb)
g = lm(Life.Exp ~., data=statedata)
summary(g)
## 
## Call:
## lm(formula = Life.Exp ~ ., data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48895 -0.51232 -0.02747  0.57002  1.49447 
## 
## Coefficients:
##                   Estimate     Std. Error t value             Pr(>|t|)    
## (Intercept) 70.94322411113  1.74797537818  40.586 < 0.0000000000000002 ***
## Population   0.00005180036  0.00002918703   1.775               0.0832 .  
## Income      -0.00002180424  0.00024442561  -0.089               0.9293    
## Illiteracy   0.03382032136  0.36627989117   0.092               0.9269    
## Murder      -0.30112317045  0.04662072985  -6.459         0.0000000868 ***
## HS.Grad      0.04892947888  0.02332327770   2.098               0.0420 *  
## Frost       -0.00573500110  0.00314322966  -1.825               0.0752 .  
## Area        -0.00000007383  0.00000166816  -0.044               0.9649    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared:  0.7362, Adjusted R-squared:  0.6922 
## F-statistic: 16.74 on 7 and 42 DF,  p-value: 0.0000000002534
AIC(g)
## [1] 121.7092
## step 1: drop Area (largest p-value, 0.9649); AIC falls 121.71 -> 119.71
g = lm(Life.Exp ~ . - Area, data=statedata)
summary(g)
## 
## Call:
## lm(formula = Life.Exp ~ . - Area, data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.49047 -0.52533 -0.02546  0.57160  1.50374 
## 
## Coefficients:
##                Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept) 70.98931852  1.38745441  51.165 < 0.0000000000000002 ***
## Population   0.00005188  0.00002879   1.802               0.0785 .  
## Income      -0.00002444  0.00023429  -0.104               0.9174    
## Illiteracy   0.02845881  0.34163295   0.083               0.9340    
## Murder      -0.30182314  0.04334432  -6.963         0.0000000145 ***
## HS.Grad      0.04847232  0.02066727   2.345               0.0237 *  
## Frost       -0.00577576  0.00297023  -1.945               0.0584 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7361 on 43 degrees of freedom
## Multiple R-squared:  0.7361, Adjusted R-squared:  0.6993 
## F-statistic: 19.99 on 6 and 43 DF,  p-value: 0.00000000005362
AIC(g)
## [1] 119.7116
## step 2: also drop Illiteracy; AIC falls 119.71 -> 117.72
g = lm(Life.Exp ~ . - (Area + Illiteracy), data=statedata)
summary(g)
## 
## Call:
## lm(formula = Life.Exp ~ . - (Area + Illiteracy), data = statedata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4892 -0.5122 -0.0329  0.5645  1.5166 
## 
## Coefficients:
##                Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept) 71.06575094  1.02894145  69.067 < 0.0000000000000002 ***
## Population   0.00005115  0.00002709   1.888               0.0657 .  
## Income      -0.00002477  0.00023160  -0.107               0.9153    
## Murder      -0.30000770  0.03704182  -8.099       0.000000000291 ***
## HS.Grad      0.04775797  0.01859079   2.569               0.0137 *  
## Frost       -0.00590986  0.00246778  -2.395               0.0210 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7277 on 44 degrees of freedom
## Multiple R-squared:  0.7361, Adjusted R-squared:  0.7061 
## F-statistic: 24.55 on 5 and 44 DF,  p-value: 0.00000000001019
AIC(g)
## [1] 117.7196
## step 3: also drop Income; AIC falls 117.72 -> 115.73
g = lm(Life.Exp ~ . - (Area + Illiteracy + Income), data=statedata)
summary(g)
## 
## Call:
## lm(formula = Life.Exp ~ . - (Area + Illiteracy + Income), data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47095 -0.53464 -0.03701  0.57621  1.50683 
## 
## Coefficients:
##                Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept) 71.02712853  0.95285296  74.542 < 0.0000000000000002 ***
## Population   0.00005014  0.00002512   1.996              0.05201 .  
## Murder      -0.30014880  0.03660946  -8.199       0.000000000177 ***
## HS.Grad      0.04658225  0.01482706   3.142              0.00297 ** 
## Frost       -0.00594329  0.00242087  -2.455              0.01802 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared:  0.736,  Adjusted R-squared:  0.7126 
## F-statistic: 31.37 on 4 and 45 DF,  p-value: 0.000000000001696
AIC(g)
## [1] 115.7326
## step 4: also drop Population; AIC RISES 115.73 -> 117.97, so backward
## selection by AIC stops and prefers the previous four-predictor model
## (Population + Murder + HS.Grad + Frost)
g = lm(Life.Exp ~ . - (Area + Illiteracy + Income + Population), data=statedata)
summary(g)
## 
## Call:
## lm(formula = Life.Exp ~ . - (Area + Illiteracy + Income + Population), 
##     data = statedata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5015 -0.5391  0.1014  0.5921  1.2268 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 71.036379   0.983262  72.246 < 0.0000000000000002 ***
## Murder      -0.283065   0.036731  -7.706       0.000000000804 ***
## HS.Grad      0.049949   0.015201   3.286              0.00195 ** 
## Frost       -0.006912   0.002447  -2.824              0.00699 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7427 on 46 degrees of freedom
## Multiple R-squared:  0.7127, Adjusted R-squared:  0.6939 
## F-statistic: 38.03 on 3 and 46 DF,  p-value: 0.000000000001634
AIC(g)
## [1] 117.9743
####################################################################
####################################################################

0.0.10 Penalized Regression (ridge, lasso)

####################################################################
## load the life expectancy data and run a model using all variables
####################################################################

## state.x77 matrix -> data frame, one row per US state (abbreviation as rowname)
data(state)
statedata = data.frame(state.x77,row.names=state.abb)
## ordinary least squares using all 7 predictors, for reference
model.full = lm(Life.Exp ~., data=statedata)
coef(model.full)
##       (Intercept)        Population            Income        Illiteracy 
## 70.94322411112951  0.00005180036383 -0.00002180423783  0.03382032135529 
##            Murder           HS.Grad             Frost              Area 
## -0.30112317045183  0.04892947888172 -0.00573500110356 -0.00000007383166
summary(model.full)
## 
## Call:
## lm(formula = Life.Exp ~ ., data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48895 -0.51232 -0.02747  0.57002  1.49447 
## 
## Coefficients:
##                   Estimate     Std. Error t value             Pr(>|t|)    
## (Intercept) 70.94322411113  1.74797537818  40.586 < 0.0000000000000002 ***
## Population   0.00005180036  0.00002918703   1.775               0.0832 .  
## Income      -0.00002180424  0.00024442561  -0.089               0.9293    
## Illiteracy   0.03382032136  0.36627989117   0.092               0.9269    
## Murder      -0.30112317045  0.04662072985  -6.459         0.0000000868 ***
## HS.Grad      0.04892947888  0.02332327770   2.098               0.0420 *  
## Frost       -0.00573500110  0.00314322966  -1.825               0.0752 .  
## Area        -0.00000007383  0.00000166816  -0.044               0.9649    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared:  0.7362, Adjusted R-squared:  0.6922 
## F-statistic: 16.74 on 7 and 42 DF,  p-value: 0.0000000002534
####################################################################
## try a few ridge regression penalties
####################################################################

library(MASS)

## enormous penalty: every slope is shrunk essentially to zero
model.ridge1 = lm.ridge(Life.Exp ~., data=statedata, lambda = 1000000)
coef(model.ridge1)
##                                Population               Income 
## 70.87833376997693335 -0.00000000102274337  0.00000003716046615 
##           Illiteracy               Murder              HS.Grad 
## -0.00006479091177417 -0.00001419577213165  0.00000483752778187 
##                Frost                 Area 
##  0.00000033830292367 -0.00000000008442886
## near-zero penalty: coefficients match the OLS fit above to many digits
model.ridge2 = lm.ridge(Life.Exp ~., data=statedata, lambda = .0000001)
coef(model.ridge2)
##                          Population            Income        Illiteracy 
## 70.94322410514499  0.00005180036336 -0.00002180423585  0.03382031584235 
##            Murder           HS.Grad             Frost              Area 
## -0.30112316886285  0.04892947874928 -0.00573500108468 -0.00000007383168
####################################################################
## use cross-validation to choose lambda in ridge regression
####################################################################

set.seed(12)

## lambda grid, equally spaced on the log2 scale
grid.lam = seq(-2, 14, by = .1)
lam = 2^(grid.lam)

## COEFS: full-data (non-intercept) ridge coefficients, one column per lambda
## MSE:   one row per random train/test split, one column per lambda
COEFS = matrix(NA, nrow = 7, ncol = length(lam))
MSE = matrix(NA, nrow = 10, ncol = length(lam))

for(k in 1:10){

  ## redraw a random 40-train / 10-test split each repetition
  statedata = mutate(statedata, 
                     cv_group = sample(1:50, 50, replace = FALSE) <= 40,
                     cv_group = factor(cv_group, levels = c(TRUE, FALSE), 
                                       labels = c("train", "test")))

  ## the split is fixed within a repetition, so build the subsets once here
  ## (originally recomputed inside the lambda loop, length(lam) times over)
  data.train = filter(statedata, cv_group == "train") %>% dplyr::select(-cv_group)
  data.test = filter(statedata, cv_group == "test") %>% dplyr::select(-cv_group)

  for(l in seq_along(lam)){

    model.cur = lm.ridge(Life.Exp ~ ., data = data.train, lambda = lam[l])
    ## lm.ridge has no predict method: build X = [1, predictors] by hand
    predictions = as.matrix(cbind(1, dplyr::select(data.test, -Life.Exp))) %*% 
      coef(model.cur)
    MSE[k,l] = mean(as.matrix((dplyr::select(data.test, Life.Exp)) - predictions)^2)

  }

}

## full-data coefficient path for the plot below: deterministic, so compute
## it once per lambda (originally recomputed identically in every CV fold)
for(l in seq_along(lam)){
  COEFS[,l] = coef(lm.ridge(Life.Exp ~., 
                            data = dplyr::select(statedata, -cv_group), 
                            lambda = lam[l]))[-1]
}

## plot CV curve for ridge regression
plot.dat = data.frame(grid = grid.lam, MSE = apply(MSE, 2, mean))

## `ylab` is not a labs() argument (the original's y-label was silently
## ignored); the correct aesthetic name is `y`
ggplot(plot.dat, aes(x = grid, y = MSE)) + geom_path() + 
  labs(x = expression(log[2](lambda)), y = "CV MSE")

## coefficient plot for ridge regression
rownames(COEFS) = colnames(dplyr::select(statedata, -Life.Exp, -cv_group))
colnames(COEFS) = grid.lam
plot.dat = melt(COEFS)

ggplot(plot.dat, aes(x = X2, y = value, group = X1, color = X1)) + geom_path() +
  labs(x = expression(log[2](lambda)), y = "Coefficient")

## output of final model at the CV-minimizing lambda.
## NOTE: the original fit on `statedata` directly, which still contained the
## cv_group factor from the CV loop -- the "final" model therefore included a
## spurious cv_grouptest term (it appeared in the printed coefficients).
## Dropping cv_group fixes that leak; coefficients shift slightly as a result.
Lam.Final = lam[which.min(apply(MSE, 2, mean))]
model.ridge3 = lm.ridge(Life.Exp ~., data = dplyr::select(statedata, -cv_group), 
                        lambda = Lam.Final)
coef(model.ridge3)
####################################################################
## try a few lasso penalties
####################################################################

## start the lasso section from a clean workspace (teaching-script convention;
## rm(list = ls()) would be avoided in production code)
rm(list = ls())

data(state)
statedata = data.frame(state.x77,row.names=state.abb)

## glmnet wants a numeric matrix X and response vector y;
## column 4 of statedata is Life.Exp
X = as.matrix(statedata[,-4])
y = statedata[,4]

## near-zero penalty: essentially reproduces the OLS coefficients
model.lasso1 = glmnet(X, y, lambda = 0.00001)
coef(model.lasso1)
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                            s0
## (Intercept) 70.94187060654372
## Population   0.00005185263339
## Income      -0.00002191146801
## Illiteracy   0.03467775014259
## Murder      -0.30121570685934
## HS.Grad      0.04894537973228
## Frost       -0.00573085295669
## Area        -0.00000007370497
## moderate penalty: Income, Illiteracy, Area are zeroed out ('.' = exactly 0)
model.lasso2 = glmnet(X, y, lambda = 0.01)
coef(model.lasso2)
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                         s0
## (Intercept) 71.01048181388
## Population   0.00004762476
## Income       .            
## Illiteracy   .            
## Murder      -0.29445650829
## HS.Grad      0.04551700912
## Frost       -0.00554215728
## Area         .
## huge penalty: every slope is zeroed; only the intercept survives
model.lasso3 = glmnet(X, y, lambda = 10)
coef(model.lasso3)
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                  s0
## (Intercept) 70.8786
## Population   0.0000
## Income       .     
## Illiteracy   .     
## Murder       .     
## HS.Grad      .     
## Frost        .     
## Area         .
####################################################################
## use cross-validation to choose lambda
####################################################################

set.seed(12)

## lambda grid on the log2 scale (the lasso needs much smaller penalties
## than ridge before coefficients start dropping out)
grid.lam = seq(-9, 0, by = .1)
lam = 2^(grid.lam)

## NOTE(review): GROUP is never used below; the sample() call is kept anyway
## so the RNG stream -- and hence every cv_group draw after set.seed(12) --
## matches the original run exactly
GROUP = sample(rep(1:5, each = 10), 50)

## COEFS: full-data (non-intercept) lasso coefficients, one column per lambda
## MSE:   one row per random train/test split, one column per lambda
COEFS = matrix(NA, nrow = 7, ncol = length(lam))
MSE = matrix(NA, nrow = 10, ncol = length(lam))

for(k in 1:10){
  
  ## redraw a random 40-train / 10-test split each repetition
  statedata = mutate(statedata, 
                     cv_group = sample(1:50, 50, replace = FALSE) <= 40,
                     cv_group = factor(cv_group, levels = c(TRUE, FALSE), 
                                       labels = c("train", "test")))
  
  ## build the split once per repetition (originally rebuilt for every lambda)
  data.train = filter(statedata, cv_group == "train") %>% dplyr::select(-cv_group)
  data.test = filter(statedata, cv_group == "test") %>% dplyr::select(-cv_group)
  
  ## design matrices / responses; the originals were misleadingly named
  ## X.test / y.test even though they held the TRAINING rows
  X.train = dplyr::select(data.train, -Life.Exp) %>% as.matrix
  y.train = dplyr::select(data.train, Life.Exp) %>% as.matrix %>% as.vector
  X.test = dplyr::select(data.test, -Life.Exp) %>% as.matrix
  y.test = dplyr::select(data.test, Life.Exp) %>% as.matrix %>% as.vector
  
  for(l in seq_along(lam)){
    model.cur = glmnet(X.train, y.train, lambda = lam[l])
    predictions = predict(model.cur, newx = X.test)
    MSE[k,l] = mean((y.test - predictions)^2)
  }
}

## full-data design for the coefficient path and for the final model chosen
## below; deterministic, so computed once instead of inside the CV loops
X = dplyr::select(statedata, -Life.Exp, -cv_group) %>% as.matrix
y = dplyr::select(statedata, Life.Exp) %>% as.matrix %>% as.vector

for(l in seq_along(lam)){
  COEFS[,l] = coef(glmnet(X, y, lambda = lam[l]))[-1]
}


## plot CV curve for the lasso (the original comment said "ridge")
plot.dat = data.frame(grid = grid.lam, MSE = apply(MSE, 2, mean))

## `ylab` is not a labs() argument; the correct aesthetic name is `y`
ggplot(plot.dat, aes(x = grid, y = MSE)) + geom_path() + 
  labs(x = expression(log[2](lambda)), y = "CV MSE")

## coefficient-path plot for the lasso
rownames(COEFS) = colnames(dplyr::select(statedata, -Life.Exp, -cv_group))
colnames(COEFS) = grid.lam
plot.dat = melt(COEFS)

ggplot(plot.dat, aes(x = X2, y = value, group = X1, color = X1)) + geom_path() +
  labs(x = expression(log[2](lambda)), y = "Coefficient")

## output of final model at the CV-minimizing lambda
## (which.min is the idiomatic single-index form of which(x == min(x)))
Lam.Final = lam[which.min(apply(MSE, 2, mean))]
model.lasso4 = glmnet(X, y, lambda = Lam.Final)
coef(model.lasso4)
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                         s0
## (Intercept) 70.95395594683
## Population   0.00003900914
## Income       .            
## Illiteracy   .            
## Murder      -0.27500983918
## HS.Grad      0.04187267828
## Frost       -0.00417151783
## Area         .
####################################################################

0.0.11 Splines and Penalized Splines

####################################################################
## load lidar data
####################################################################

## lidar data from SemiPar: logratio (response) vs range (distance)
data(lidar)

x = lidar$range

## scatterplot reused as the backdrop for every fitted curve below
p1 = ggplot(lidar, aes(x = range, y = logratio)) + geom_point()
p1

####################################################################
## construct spline basis and fit piecewise linear model
####################################################################

range = lidar$range
y = lidar$logratio

## truncated-line basis: [1, range, (range-550)_+, (range-625)_+]
knots <- c(550, 625)
X.des = cbind(1, range, sapply(knots, function(k) 
                ((range - k > 0) * (range - k))))

## `- 1` drops lm's implicit intercept: the first column of X.des is it
lm.lin = lm(y ~ X.des - 1)
summary(lm.lin)
## 
## Call:
## lm(formula = y ~ X.des - 1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.294936 -0.034261  0.001713  0.039888  0.255162 
## 
## Coefficients:
##               Estimate  Std. Error t value            Pr(>|t|)    
## X.des      -0.01444288  0.06873539  -0.210               0.834    
## X.desrange -0.00008407  0.00014266  -0.589               0.556    
## X.des      -0.00704279  0.00038342 -18.368 <0.0000000000000002 ***
## X.des       0.00572319  0.00051535  11.105 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08005 on 217 degrees of freedom
## Multiple R-squared:  0.9617, Adjusted R-squared:  0.961 
## F-statistic:  1362 on 4 and 217 DF,  p-value: < 0.00000000000000022
## overlay the fitted broken-line curve on the scatterplot
lidar.pred = data.frame(range = range, fitted = fitted(lm.lin))
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "red")

####################################################################
## again, with different knots
####################################################################

## same piecewise-linear fit, but with knots moved to 500 and 650 to show
## how sensitive the fit is to knot placement
knots <- c(500, 650)
X.des = cbind(1, range, sapply(knots, function(k) 
                ((range - k > 0) * (range - k))))

lm.lin = lm(y ~ X.des - 1)
summary(lm.lin)
## 
## Call:
## lm(formula = y ~ X.des - 1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.304857 -0.057340  0.006559  0.046501  0.255679 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## X.des      -0.4732124  0.1167565  -4.053           0.00007046 ***
## X.desrange  0.0009918  0.0002533   3.915             0.000121 ***
## X.des      -0.0052984  0.0003695 -14.340 < 0.0000000000000002 ***
## X.des       0.0027191  0.0005668   4.798           0.00000298 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09382 on 217 degrees of freedom
## Multiple R-squared:  0.9474, Adjusted R-squared:  0.9464 
## F-statistic: 976.4 on 4 and 217 DF,  p-value: < 0.00000000000000022
## note the worse fit (R-squared drops from 0.962 to 0.947)
lidar.pred = data.frame(range = range, fitted = fitted(lm.lin))
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "red")

####################################################################
## construct spline basis and fit piecewise quadratic model
####################################################################

## piecewise quadratic: basis [1, range, range^2, (range-k)_+^2 for each knot]
knots <- c(500, 600, 675)
X.des = cbind(1, range, range^2, sapply(knots, function(k) 
                ((range - k > 0) * (range - k)^2)))

## `- 1` again because the intercept lives in the first column of X.des
lm.lin = lm(y ~ X.des - 1)
summary(lm.lin)
## 
## Call:
## lm(formula = y ~ X.des - 1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.275013 -0.038263  0.002416  0.041911  0.244874 
## 
## Coefficients:
##                Estimate   Std. Error t value             Pr(>|t|)    
## X.des       1.228093762  1.058287506   1.160               0.2472    
## X.desrange -0.005749387  0.004574109  -1.257               0.2101    
## X.des       0.000006424  0.000004905   1.310               0.1917    
## X.des      -0.000049227  0.000008055  -6.111        0.00000000459 ***
## X.des       0.000099100  0.000010115   9.798 < 0.0000000000000002 ***
## X.des      -0.000097083  0.000038415  -2.527               0.0122 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08241 on 215 degrees of freedom
## Multiple R-squared:  0.9598, Adjusted R-squared:  0.9586 
## F-statistic: 854.7 on 6 and 215 DF,  p-value: < 0.00000000000000022
## fitted curve is now smooth (continuous first derivative at the knots)
lidar.pred = data.frame(range = range, fitted = fitted(lm.lin))
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "red")

####################################################################
## b-spline basis
####################################################################

library(splines)

## cubic B-spline basis with 6 df (intercept included); each column is one
## locally-supported basis function
bspline = bs(x = range, df = 6, degree = 3, intercept = TRUE)
rownames(bspline) = range
colnames(bspline) = paste0("Basis ", 1:6)

## melt to long form (X1 = range value, X2 = basis label) and plot each
## basis function as its own curve
bspline.m = melt(bspline)

ggplot(bspline.m, aes(x = X1, y = value, group = X2, color = X2)) + geom_path()

####################################################################
## define CV groups, construct design matrices for training and validation data
####################################################################

set.seed(1001)

## single random ~80/20 train(0)/validation(1) split; also stored on lidar
## so the plots below can color the two groups
GROUP = lidar$GROUP = sample(c(0,1), size = length(y), prob = c(.8, .2), replace = TRUE)

## 43 interior knots at the quantiles of range (endpoints dropped);
## rich truncated-line basis [1, range, (range-k)_+ ...]
knots <- quantile(range, seq(0, 1, length = 45))[-c(1, 45)]
X.full = cbind(1, range, sapply(knots, function(k) ((range - k > 0) * (range - k))))

X.train = X.full[which(GROUP != 1), ]; y.train = y[which(GROUP != 1)]
X.valid = X.full[which(GROUP == 1), ]; y.valid = y[which(GROUP == 1)]


####################################################################
## construct penalty matrix; estimate ridge regression for a few lambdas
####################################################################

## identity penalty on all basis coefficients, with P[1,1] = 0 so the
## intercept is left unpenalized
P = diag(1, dim(X.full)[2], dim(X.full)[2])
P[1,1] = 0

## lambda = 10^-2 ... 10^7, so lambda[4] = 10^1, lambda[6] = 10^3,
## lambda[10] = 10^7
lambda = 10^(-2:7)

## closed-form ridge solutions (X'X + lambda P)^-1 X'y; beta.lm is the
## unpenalized (lambda = 0) least-squares fit
beta.lm = solve(t(X.train) %*% X.train)%*% t(X.train) %*% y.train
beta.lam1 = solve(t(X.train) %*% X.train + lambda[4]* P)%*% t(X.train) %*% y.train
beta.lam2 = solve(t(X.train) %*% X.train + lambda[10]* P)%*% t(X.train) %*% y.train
beta.lam3 = solve(t(X.train) %*% X.train + lambda[6]* P)%*% t(X.train) %*% y.train


####################################################################
## plot results
####################################################################

## scatterplot with training points in black and validation points in red
p1 = ggplot(lidar, aes(x = range, y = logratio, color = as.factor(GROUP))) + 
  geom_point() +
  scale_color_manual(values = c("black", "red"), guide = "none")
p1

## fitted curves are evaluated on the training rows only (X.train), so the
## plotted path covers range[GROUP == 0]
lidar.pred = data.frame(range = range[which(GROUP == 0)], fitted = X.train %*% beta.lm)
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "blue")

## lambda = 10: light smoothing
lidar.pred$fitted = X.train %*% beta.lam1
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "blue")

## lambda = 10^7: heavy smoothing (nearly a straight line)
lidar.pred$fitted = X.train %*% beta.lam2
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "blue")

## lambda = 10^3: intermediate smoothing
lidar.pred$fitted = X.train %*% beta.lam3
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "blue")

####################################################################
## find cross-validation curve for one fold only
####################################################################

## lambda grid on the log10 scale
grid.lam = seq(-2, 7, by = .2)
lambda = 10^(grid.lam)

CV = rep(NA, length(lambda))

## validation MSE for each penalty, using the single train/valid split above
for(l in seq_along(lambda)){
  
  beta.cur = solve(t(X.train) %*% X.train + lambda[l]* P)%*% t(X.train) %*% y.train
  CV[l] = mean((y.valid - X.valid %*% beta.cur)^2 )
  
}

## plot CV curve for ridge regression
plot.dat = data.frame(grid = grid.lam, CV = CV)

## `ylab` is not a labs() argument (the y label was silently dropped in the
## original); the correct aesthetic name is `y`
ggplot(plot.dat, aes(x = grid, y = CV)) + geom_path() + 
  labs(x = expression(log[10](lambda)), y = "CV MSE")

####################################################################
## kernel smoothing
####################################################################
## plain scatterplot backdrop (no group coloring this time)
p1 = ggplot(lidar, aes(x = range, y = logratio)) + geom_point()

## Gaussian kernel smoother, moderate bandwidth
kern.smooth1 = as.data.frame(ksmooth(range, y, kernel = "normal", bandwidth = 50))
p1 + geom_path(data = kern.smooth1, aes(x = x, y = y), color = "blue")

## very small bandwidth: wiggly, nearly interpolating fit
kern.smooth2 = as.data.frame(ksmooth(range, y, kernel = "normal", bandwidth = 10))
p1 + geom_path(data = kern.smooth2, aes(x = x, y = y), color = "blue")

## very large bandwidth: heavy oversmoothing (kern.smooth2 is reused here)
kern.smooth2 = as.data.frame(ksmooth(range, y, kernel = "normal", bandwidth = 500))
p1 + geom_path(data = kern.smooth2, aes(x = x, y = y), color = "blue")

####################################################################

0.0.12 Additive Models

## load nepalese data
## load nepalese data; sep="," makes read.delim behave like read.csv.
## Rows are sorted by weight so fitted() values plot as a tidy curve later.
data <- read.delim("nepal.dat", header = TRUE, sep=",") 
data = data[order(data$weight),]

## divide data into training (GROUP == 1, ~80%) and validation
## (GROUP == 0, ~20%) data
set.seed(1)
data$GROUP = sample(c(0,1), prob = c(.2, .8), 
                    size = dim(data)[1], replace = TRUE)
data.train = subset(data, GROUP == 1)
data.valid = subset(data, GROUP == 0)


## plot arm circumference against weight (validation points in red);
## guide = "none" replaces the deprecated guide = FALSE (ggplot2 >= 3.3.4)
p1 = ggplot(data, aes(x = weight, y = armc, 
                      color = as.factor(GROUP))) + 
  geom_point(alpha = .5) +
  scale_color_manual(values = c("red", "black"), guide = "none")
p1

####################################################################
## some regression analyses of armcirc vs weight
####################################################################

## validation MSE for each of six competing estimators of E(armc | weight)
CV = rep(NA, 6)

## 1. constant fit: f(x) = E(y), the training mean
fx = mean(data.train$armc)

p1 + geom_hline(yintercept = fx, color = "blue", size = 1.5)

CV[1] = mean((data.valid$armc - fx)^2)


## 2. simple linear regression
fx = lm(armc~weight, data = data.train)
p1 + stat_smooth(data = data.train, 
                 method = lm, se = FALSE, 
                 color = "blue", size = 1.5)

CV[2] = mean((data.valid$armc - predict(fx, newdata = data.valid))^2)


## 3. degree-4 polynomial (orthogonal poly basis)
fx = lm(armc~poly(weight, 4), data = data.train)
plot.fx = data.frame(weight = data.train$weight, fx = fitted(fx))
p1 + geom_line(data = plot.fx, 
               aes(x = weight, y = fx, color = NULL), 
               color = "blue", size = 1.5)

CV[3] = mean((data.valid$armc - predict(fx, newdata = data.valid))^2)


## 4. single-knot linear spline; the truncated-line term (weight-7)_+ must be
## built in both training and validation data so predict() can use it
data.train = mutate(data.train, weight.sp = (weight>7)*(weight - 7))
data.valid = mutate(data.valid, weight.sp = (weight>7)*(weight - 7))

fx = lm(armc ~ weight + weight.sp, data = data.train)
plot.fx = data.frame(weight = data.train$weight, fx = fitted(fx))
p1 + geom_line(data = plot.fx, 
               aes(x = weight, y = fx, color = NULL), 
               color = "blue", size = 1.5)

CV[4] = mean((data.valid$armc - predict(fx, newdata = data.valid))^2)



## 5. penalized spline (mgcv chooses the smoothing parameter automatically)
fx = mgcv::gam(armc ~ s(weight), data = data.train)
plot.fx = data.frame(weight = data.train$weight, fx = fitted(fx))
p1 + geom_line(data = plot.fx, 
               aes(x = weight, y = fx, color = NULL), 
               color = "blue", size = 1.5)

CV[5] = mean((data.valid$armc - predict(fx, newdata = data.valid))^2)


## 6. Gaussian kernel smoother
plot.fx = with(data.train, 
               as.data.frame(ksmooth(weight, armc, 
                                     kernel = "normal", 
                                     bandwidth = 2)))
p1 + geom_line(data = plot.fx, 
               aes(x = x, y = y, color = NULL), 
               color = "blue", size = 1.5)

## ksmooth returns fits at sorted x.points, so the validation responses are
## reordered by weight to line up with fx.cv
fx.cv = with(data.train, ksmooth(weight, armc, kernel = "normal", 
                                 bandwidth = 2, 
                                 x.points = data.valid$weight)$y)

CV[6] = mean((data.valid$armc[order(data.valid$weight)] - fx.cv)^2)


## compare CV across the six methods (CV becomes a data.frame here;
## later sections append rows 7 and 8 to it)
CV = data.frame(x = 1:6, CV = CV)
ggplot(CV, aes(x = x, y = CV)) + 
  geom_point() + geom_line() + 
  labs(x = "") + 
  scale_x_continuous(breaks = 1:6, 
                     labels = c("Obs Mean", "SLR", "Poly", 
                                "Spline", "P-Spline", "Kernel"))

####################################################################
## extended case study of arm circumference
####################################################################

## penalized spline with mgcv defaults (smoothing parameter chosen by GCV)
fx = mgcv::gam(armc ~ s(weight), data = data.train)
plot.fx = data.frame(weight = data.train$weight, fx = fitted(fx))
p1 + geom_line(data = plot.fx, 
               aes(x = weight, y = fx, color = NULL), 
               color = "blue", size = 1.5)

## note the small effective df (edf ~ 3.1): heavy smoothing was selected
summary(fx)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## armc ~ s(weight)
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   23.761      0.235   101.1 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##             edf Ref.df    F             p-value    
## s(weight) 3.087  3.654 3452 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.94   Deviance explained = 94.1%
## GCV = 44.533  Scale est. = 44.306    n = 802
## penalized spline -- different penalty and spline basis: a large basis
## (k = 100) with a tiny fixed smoothing parameter (sp = 1e-4) deliberately
## undersmooths, to contrast with the automatic choice above
fx = gam(armc ~ s(weight, k = 100), data = data.train, sp = (.0001))
plot.fx = data.frame(weight = data.train$weight, fx = fitted(fx))
p1 + geom_line(data = plot.fx, 
               aes(x = weight, y = fx, color = NULL), 
               color = "blue", size = 1.5)

## edf ~ 96.5: nearly every basis function is used (overfitting; GCV worsens)
summary(fx)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## armc ~ s(weight, k = 100)
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  23.7606     0.2498   95.13 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##             edf Ref.df     F             p-value    
## s(weight) 96.53  98.77 113.1 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.933   Deviance explained = 94.1%
## GCV = 56.964  Scale est. = 50.037    n = 802
## separate boys and girls: one gam smooth per sex
ggplot(data.train, aes(x = weight, y = armc, color = as.factor(sex))) + 
  geom_point(alpha = .5) +
  scale_color_manual(values = c("blue", "red"), guide = FALSE) +
  stat_smooth(method = "gam", formula = y ~ s(x), se = FALSE, size = 1.5)

## semiparametric model: linear sex effect and sex-by-weight interaction plus
## a smooth in weight; note the "Rank: 12/13" below -- the linear weight term
## is confounded with the smooth, so the model is rank deficient
fx = gam(armc ~ sex + sex * weight + s(weight), data = data.train) 
summary(fx)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## armc ~ sex + sex * weight + s(weight)
## 
## Parametric coefficients:
##             Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)  8.92615    2.76527   3.228     0.001298 ** 
## sex          0.93802    0.59137   1.586     0.113095    
## weight       0.70637    0.12325   5.731 0.0000000141 ***
## sex:weight  -0.06365    0.01726  -3.688     0.000242 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##             edf Ref.df    F        p-value    
## s(weight) 2.767  3.381 15.5 0.000000000109 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 12/13
## R-sq.(adj) =  0.941   Deviance explained = 94.2%
## GCV = 43.926  Scale est. = 43.575    n = 802
## fully parametric counterpart for comparison (no smooth term)
summary(lm(armc ~ sex + sex * weight, data = data.train))
## 
## Call:
## lm(formula = armc ~ sex + sex * weight, data = data.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -83.139  -1.250   0.369   2.008   8.215 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  1.94813    0.94810   2.055             0.040226 *  
## sex          1.34264    0.60640   2.214             0.027104 *  
## weight       0.99946    0.02523  39.610 < 0.0000000000000002 ***
## sex:weight  -0.06403    0.01735  -3.691             0.000239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.83 on 798 degrees of freedom
## Multiple R-squared:  0.9374, Adjusted R-squared:  0.9371 
## F-statistic:  3981 on 3 and 798 DF,  p-value: < 0.00000000000000022
## default mgcv plot of the estimated smooth s(weight) from fx above
plot(fx)

## separate age groups: right-closed bins (0,22], (22,32], (32,40], (40,50],
## (50,100], labelled 1..5; built identically in training and validation data
data.train = mutate(data.train, 
                    ageGroup = cut(age, breaks = c(0, 22, 32, 40, 50, 100), 
                                   labels = 1:5))
data.valid = mutate(data.valid, 
                    ageGroup = cut(age, breaks = c(0, 22, 32, 40, 50, 100), 
                                   labels = 1:5))

## one gam smooth per age group; guide = "none" replaces the deprecated
## guide = FALSE (ggplot2 >= 3.3.4)
ggplot(data.train, aes(x = weight, y = armc, color = ageGroup)) + 
  geom_point(alpha = .5) +
  stat_smooth(method = "gam", formula = y ~ s(x), se = FALSE, size = 1.5) + 
  scale_color_discrete(guide = "none")

## additive model: separate smooth effects of age and weight
fx = gam(armc ~ s(age) + s(weight), data = data.train) 
summary(fx)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## armc ~ s(age) + s(weight)
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  23.7606     0.2342   101.4 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##             edf Ref.df        F             p-value    
## s(age)    1.528  1.879    4.343              0.0114 *  
## s(weight) 2.694  3.234 3916.239 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.941   Deviance explained = 94.1%
## GCV = 44.293  Scale est. = 44.005    n = 802
## append as row 7; this relies on CV having been converted to a data.frame
## (x, CV) at the end of the six-model comparison section
CV[7,] = c(7, mean((data.valid$armc - predict(fx, newdata = data.valid))^2))

## plot both estimated smooths side by side
par(mfrow = c(1,2))
plot(fx)

## linear model (age + weight entering linearly) for comparison
fx = lm(armc ~ age + weight, data = data.train)
summary(fx)
## 
## Call:
## lm(formula = armc ~ age + weight, data = data.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -83.299  -0.509   0.131   0.880   8.122 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  7.340770   0.548202   13.39 < 0.0000000000000002 ***
## age         -0.094170   0.012557   -7.50     0.00000000000017 ***
## weight       0.914034   0.008137  112.33 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.654 on 799 degrees of freedom
## Multiple R-squared:  0.9405, Adjusted R-squared:  0.9403 
## F-statistic:  6314 on 2 and 799 DF,  p-value: < 0.00000000000000022
## append as row 8 of the CV data.frame
CV[8,] = c(8, mean((data.valid$armc - predict(fx, newdata = data.valid))^2))


## compare validation MSE across all eight methods
ggplot(CV, aes(x = x, y = CV)) + 
  geom_point() + 
  geom_line() + 
  labs(x = "") + 
  scale_x_continuous(breaks = 1:8, 
                     labels = c("Obs Mean", "SLR", "Poly", 
                                "Spline", "P-Spline", "Kernel", 
                                "Smooth", "MLR"))

####################################################################

0.0.13 Weighted and Generalized Least Squares

####################################################################
## load the physics data (strong interaction data from the alr3 package)
####################################################################

library(alr3)
## physics data: response y with known per-observation standard deviation SD
data(physics)
head(physics)
##       x   y SD
## 1 0.345 367 17
## 2 0.287 311  9
## 3 0.251 295  9
## 4 0.225 268  7
## 5 0.207 253  7
## 6 0.186 239  6
## weighted least squares: weight each case by inverse variance 1/SD^2
lm.physics.wls <- lm(y~x, weights=1/SD^2,data=physics)
summary(lm.physics.wls)
## 
## Call:
## lm(formula = y ~ x, data = physics, weights = 1/SD^2)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3230 -0.8842  0.0000  1.3900  2.3353 
## 
## Coefficients:
##             Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)  148.473      8.079   18.38 0.0000000791 ***
## x            530.835     47.550   11.16 0.0000037104 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.657 on 8 degrees of freedom
## Multiple R-squared:  0.9397, Adjusted R-squared:  0.9321 
## F-statistic: 124.6 on 1 and 8 DF,  p-value: 0.00000371
## ordinary least squares for comparison: note the shift in both the slope
## (620 vs 531) and the standard errors
lm.physics.ols <- lm(y~x, data=physics)
summary(lm.physics.ols)
## 
## Call:
## lm(formula = y ~ x, data = physics)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.773  -9.319  -2.829   5.571  19.817 
## 
## Coefficients:
##             Estimate Std. Error t value    Pr(>|t|)    
## (Intercept)   135.00      10.08    13.4 0.000000921 ***
## x             619.71      47.68    13.0 0.000001165 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.69 on 8 degrees of freedom
## Multiple R-squared:  0.9548, Adjusted R-squared:  0.9491 
## F-statistic: 168.9 on 1 and 8 DF,  p-value: 0.000001165
####################################################################

0.0.14 Longitudinal Data Analysis

####################################################################
## plot of CD4 data
#
# website for data
# http://www.biostat.jhsph.edu/~fdominic/teaching/bio655/data/data.html
####################################################################

## CD4 counts over time, one row per (subject, visit), Stata format
data = foreign::read.dta("cd4.dta")

data = data %>% dplyr::rename(ID = id, month = time, cd4 = count)

## raw scatter, ignoring the longitudinal structure
ggplot(data, aes(x = month, y = cd4)) + geom_point()

## connect repeated measurements within each subject
ggplot(data, aes(x = month, y = cd4, group = ID)) + 
  geom_point() + 
  geom_line(alpha = .4)

## highlight the trajectories of the first 10 subjects in red
ggplot(data, aes(x = month, y = cd4, group = ID)) + 
  geom_point() + 
  geom_line(alpha = .4) +
  geom_line(data = subset(data, ID %in% unique(data$ID)[1:10]), 
            color = "red", size = 1) 

####################################################################
## pig weight data
####################################################################

## plots

## Pig weight data from SemiPar: weekly weights for 48 pigs.
data(pig.weights)

## Base spaghetti plot (one path per pig), reused below.
pig.spaghetti <- ggplot(pig.weights,
                        aes(x = num.weeks, y = weight, group = id.num)) +
  geom_point() +
  geom_path(alpha = .4) +
  labs(x = "Week", y = "Weight")

pig.spaghetti

## Overlay a single OLS line pooled across all pigs.
pig.spaghetti +
  stat_smooth(method = "lm", aes(group = NULL),
              se = FALSE, color = "red", size = 1.1)

## random intercept model

## Random intercept model: a common slope in num.weeks plus a subject-specific
## intercept for each pig (id.num), fit by REML via lme4.
ranmod = lmer(weight ~ (1 | id.num) + num.weeks, data = pig.weights)

## Per-observation fitted values include each pig's predicted intercept.
pig.weights$ranmod.fit = fitted(ranmod)
## Raw trajectories (faded), pooled OLS line (red), and the model's
## subject-specific fitted lines (blue).
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) + 
  geom_point() + geom_path(alpha = .4) +
  labs(x = "Week", y = "Weight") + 
  stat_smooth(method = "lm", aes(group = NULL), 
              se = FALSE, color = "red", size = 1.1) + 
  geom_line(aes(y = ranmod.fit), color = "blue", alpha = .5)

summary(ranmod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: weight ~ (1 | id.num) + num.weeks
##    Data: pig.weights
## 
## REML criterion at convergence: 2033.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7390 -0.5456  0.0184  0.5122  3.9313 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id.num   (Intercept) 15.142   3.891   
##  Residual              4.395   2.096   
## Number of obs: 432, groups:  id.num, 48
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 19.35561    0.60314   32.09
## num.weeks    6.20990    0.03906  158.97
## 
## Correlation of Fixed Effects:
##           (Intr)
## num.weeks -0.324
####################################################################

## OLS fit
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) + 
  geom_point() + geom_path(alpha = .4) +
  labs(x = "Week", y = "Weight") + 
  stat_smooth(method = "lm", aes(group = NULL), 
              se = FALSE, color = "red", size = 1.1)

## marginal model fit
marg.mod = gee::gee(weight ~ num.weeks, id = id.num,
               corstr = "exchangeable", data = pig.weights)
## Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
## running glm to get initial regression estimate
## (Intercept)   num.weeks 
##   19.355613    6.209896
summary(marg.mod)
## 
##  GEE:  GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
##  gee S-function, version 4.13 modified 98/01/27 (1998) 
## 
## Model:
##  Link:                      Identity 
##  Variance to Mean Relation: Gaussian 
##  Correlation Structure:     Exchangeable 
## 
## Call:
## gee::gee(formula = weight ~ num.weeks, id = id.num, data = pig.weights, 
##     corstr = "exchangeable")
## 
## Summary of Residuals:
##         Min          1Q      Median          3Q         Max 
## -11.9050926  -2.5347801  -0.1951968   2.5949074  13.1751157 
## 
## 
## Coefficients:
##              Estimate Naive S.E.   Naive z Robust S.E. Robust z
## (Intercept) 19.355613  0.5983680  32.34734  0.39963854 48.43280
## num.weeks    6.209896  0.0393321 157.88366  0.09107443 68.18485
## 
## Estimated Scale Parameter:  19.29006
## Number of Iterations:  1
## 
## Working Correlation
##            [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
##  [1,] 1.0000000 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313
##  [2,] 0.7690313 1.0000000 0.7690313 0.7690313 0.7690313 0.7690313
##  [3,] 0.7690313 0.7690313 1.0000000 0.7690313 0.7690313 0.7690313
##  [4,] 0.7690313 0.7690313 0.7690313 1.0000000 0.7690313 0.7690313
##  [5,] 0.7690313 0.7690313 0.7690313 0.7690313 1.0000000 0.7690313
##  [6,] 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313 1.0000000
##  [7,] 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313
##  [8,] 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313
##  [9,] 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313
##            [,7]      [,8]      [,9]
##  [1,] 0.7690313 0.7690313 0.7690313
##  [2,] 0.7690313 0.7690313 0.7690313
##  [3,] 0.7690313 0.7690313 0.7690313
##  [4,] 0.7690313 0.7690313 0.7690313
##  [5,] 0.7690313 0.7690313 0.7690313
##  [6,] 0.7690313 0.7690313 0.7690313
##  [7,] 1.0000000 0.7690313 0.7690313
##  [8,] 0.7690313 1.0000000 0.7690313
##  [9,] 0.7690313 0.7690313 1.0000000
## random effects model fit
## Same random intercept model as above, refit here for side-by-side
## comparison with the GEE marginal fit.
ranmod = lmer(weight ~ (1 | id.num) + num.weeks, data = pig.weights)

pig.weights$ranmod.fit = fitted(ranmod)
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) + 
  geom_point() + geom_path(alpha = .4) +
  labs(x = "Week", y = "Weight") + 
  stat_smooth(method = "lm", aes(group = NULL), se = FALSE, color = "red", size = 1.1) + 
  geom_line(aes(y = ranmod.fit), color = "blue", alpha = .5)

summary(ranmod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: weight ~ (1 | id.num) + num.weeks
##    Data: pig.weights
## 
## REML criterion at convergence: 2033.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7390 -0.5456  0.0184  0.5122  3.9313 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id.num   (Intercept) 15.142   3.891   
##  Residual              4.395   2.096   
## Number of obs: 432, groups:  id.num, 48
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 19.35561    0.60314   32.09
## num.weeks    6.20990    0.03906  158.97
## 
## Correlation of Fixed Effects:
##           (Intr)
## num.weeks -0.324
## Histogram of the predicted random intercepts, one per pig.
## (Note: this data frame masks the name of lme4::ranef; function lookup
## still finds the function, but the shadowing is worth knowing about.)
ranef = data.frame(int = ranef(ranmod)$id.num[,1])
ggplot(ranef, aes(x = int)) + geom_histogram(binwidth = 2, color = "blue", alpha = .4)

####################################################################

0.0.15 Random slope models

####################################################################
## pig weight data example of fixed vs random effects
####################################################################

data(pig.weights)

## Fixed-effects fit: one indicator column per pig (no global intercept)
## plus a common slope in num.weeks.
X.des <- model.matrix(~ -1 + as.factor(id.num) + num.weeks, data = pig.weights)
lm.fixed <- lm(weight ~ -1 + X.des, data = pig.weights)
pig.ints <- coef(lm.fixed)[1:48]
fixed <- pig.ints - mean(pig.ints)   ## center so both sets are comparable

## Random-intercept fit; extract the predicted random intercepts.
ranef.mod <- lmer(weight ~ (1 | id.num) + num.weeks, data = pig.weights)
random <- ranef(ranef.mod)$id.num[, 1]

## One row per estimate: fixed effects plotted at y = 1, random at y = 0,
## with a segment connecting the two estimates for the same pig.
plot.dat <- data.frame(value = c(fixed, random),
                       yval = rep(c(1, 0), each = 48),
                       pair = rep(1:48, 2))

ggplot(plot.dat, aes(x = value, y = yval, group = pair)) +
  geom_line() +
  geom_point() +
  geom_vline(xintercept = 0, color = "red", size = 1.2) +
  theme_bw() +
  scale_y_continuous(breaks = c(0, 1), labels = c("random", "fixed")) +
  labs(y = "", x = "")

####################################################################
## pig weight data
####################################################################

data(pig.weights)

## plot of data

## random effects model fit
## Random intercept model (REML): common slope, pig-specific intercepts.
ranmod = lmer(weight ~ (1 | id.num) + num.weeks, data = pig.weights)

pig.weights$ranmod.fit = fitted(ranmod)
## Data (faded), pooled OLS line (red), subject-specific fits (blue).
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) + 
  geom_point() + geom_path(alpha = .4) +
  labs(x = "Week", y = "Weight") + 
  stat_smooth(method = "lm", aes(group = NULL), 
              se = FALSE, color = "red", size = 1.1) + 
  geom_line(aes(y = ranmod.fit), color = "blue", alpha = .5)

summary(ranmod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: weight ~ (1 | id.num) + num.weeks
##    Data: pig.weights
## 
## REML criterion at convergence: 2033.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7390 -0.5456  0.0184  0.5122  3.9313 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id.num   (Intercept) 15.142   3.891   
##  Residual              4.395   2.096   
## Number of obs: 432, groups:  id.num, 48
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 19.35561    0.60314   32.09
## num.weeks    6.20990    0.03906  158.97
## 
## Correlation of Fixed Effects:
##           (Intr)
## num.weeks -0.324
## Histogram of the predicted random intercepts (one per pig).
ranef = data.frame(int = ranef(ranmod)$id.num[,1])
ggplot(ranef, aes(x = int)) + 
  geom_histogram(binwidth = 2, color = "blue", alpha = .4)

## Random intercept AND random slope model: each pig gets its own intercept
## and its own num.weeks slope, correlated within pig.
ranmod <- lmer(weight ~ (1 + num.weeks | id.num) + num.weeks, data = pig.weights)
pig.weights$ranmod.fit <- fitted(ranmod)

## Data (faded), pooled OLS line (red), subject-specific fitted lines (blue).
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) +
  geom_point() + geom_path(alpha = .4) +
  labs(x = "Week", y = "Weight") +
  stat_smooth(method = "lm", aes(group = NULL),
              se = FALSE, color = "red", size = 1.1) +
  geom_line(aes(y = ranmod.fit), color = "blue", alpha = .5)

## Scatter of each pig's predicted random intercept against its random slope.
subj.effects <- ranef(ranmod)$id.num
ranef <- data.frame(int = subj.effects[, 1],
                    slope = subj.effects[, 2])
ggplot(ranef, aes(x = int, y = slope)) + geom_point()

####################################################################
## CD4 data example
####################################################################

## load and plot data
## Load the CD4 data and standardize the column names.
cd4 = foreign::read.dta("cd4.dta")
cd4 = cd4 %>% dplyr::rename(ID = id, month = time, cd4 = count)

## Keep only subjects observed more than once.  (FIX: the original indexed
## unique(cd4$ID) by positions taken from table(cd4$ID); table() sorts its
## names while unique() preserves order of appearance, so the two only agree
## when the data happen to be sorted by ID.  Matching on the table's names is
## order-safe.)
id.counts = table(cd4$ID)
IDS = names(id.counts)[id.counts != 1]
cd4 = cd4 %>% filter(ID %in% IDS)

ggplot(cd4, aes(x = month, y = cd4, group = ID)) + 
  geom_point() + 
  geom_line(alpha = .4)

## OLS models
## linear first
lin.mod = lm(cd4 ~ month, data = cd4)

## FIX: this plot previously used `data` -- the unfiltered CD4 data frame left
## over from an earlier chunk -- rather than the filtered `cd4` that lin.mod
## was actually fit on.
ggplot(cd4, aes(x = month, y = cd4, group = ID)) + 
  geom_point() + geom_line(alpha = .4) + 
  stat_smooth(method = "lm", aes(group = NULL), 
              se = FALSE, color = "red", size = 1.1) 

summary(lin.mod)
## 
## Call:
## lm(formula = cd4 ~ month, data = cd4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -734.49 -252.26  -65.36  174.91 2322.91 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  839.398      8.147  103.03 <0.0000000000000002 ***
## month        -89.027      3.965  -22.45 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 362.9 on 2369 degrees of freedom
## Multiple R-squared:  0.1754, Adjusted R-squared:  0.1751 
## F-statistic: 504.1 on 1 and 2369 DF,  p-value: < 0.00000000000000022
AIC(lin.mod)
## [1] 34682.64
## spline

## B-spline fit: cd4 as a smooth function of month with a 5-df spline basis.
bs.mod = lm(cd4 ~ bs(month, 5), data = cd4)

## Same spline fit drawn through the spaghetti plot (red).
ggplot(cd4, aes(x = month, y = cd4, group = ID)) + 
  geom_point() + geom_line(alpha = .4) + 
  stat_smooth(method = "lm", formula = y ~ bs(x, 5), 
              aes(group = NULL), se = FALSE, 
              color = "red", size = 1.1) 

summary(bs.mod)
## 
## Call:
## lm(formula = cd4 ~ bs(month, 5), data = cd4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -806.49 -238.46  -61.03  170.08 2261.82 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     898.19      57.65  15.580 < 0.0000000000000002 ***
## bs(month, 5)1   181.75     101.47   1.791              0.07340 .  
## bs(month, 5)2   154.21      57.27   2.693              0.00713 ** 
## bs(month, 5)3  -544.31      84.28  -6.459       0.000000000128 ***
## bs(month, 5)4  -230.25      80.16  -2.873              0.00411 ** 
## bs(month, 5)5  -400.92      98.69  -4.063       0.000050130841 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 356.2 on 2365 degrees of freedom
## Multiple R-squared:  0.2068, Adjusted R-squared:  0.2051 
## F-statistic: 123.3 on 5 and 2365 DF,  p-value: < 0.00000000000000022
## Lower AIC than the linear fit (34682.64), favoring the spline mean model.
AIC(bs.mod)
## [1] 34598.69
## random intercept
## Linear month trend plus a subject-specific intercept.
ranint.mod = lmer(cd4 ~ (1 | ID) + month, data = cd4)
cd4$ranint.mod = fitted(ranint.mod)
## Population-level fitted line: X %*% beta-hat (fixed effects only).
fitted = data.frame(month = cd4$month, 
                    fitted = getME(ranint.mod,'X') %*% fixef(ranint.mod))

## Data (faded), fixed-effects line (red), subject-specific fits (blue).
ggplot(cd4, aes(x = month, y = cd4, group = ID)) + 
  geom_point() + geom_line(alpha = .2) + 
  geom_line(data = fitted, aes(group = NULL, y = fitted), 
            color = "red", size = 1.1) + 
  geom_line(aes(y = ranint.mod), color = "blue", alpha = .5)

summary(ranint.mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cd4 ~ (1 | ID) + month
##    Data: cd4
## 
## REML criterion at convergence: 33747.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6240 -0.5538 -0.0664  0.4593  7.3603 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  ID       (Intercept) 64982    254.9   
##  Residual             66532    257.9   
## Number of obs: 2371, groups:  ID, 364
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  838.925     14.725   56.97
## month        -99.703      3.449  -28.91
## 
## Correlation of Fixed Effects:
##       (Intr)
## month -0.151
AIC(ranint.mod)
## [1] 33755.67
## random slope
## Subject-specific intercept AND month slope, correlated within subject.
ranslope.mod = lmer(cd4 ~ (1 + month | ID) + month, data = cd4)
cd4$ranslope.mod = fitted(ranslope.mod)
## Population-level fitted line: X %*% beta-hat (fixed effects only).
fitted = data.frame(month = cd4$month, 
                    fitted = getME(ranslope.mod,'X') %*% fixef(ranslope.mod))

## Data (faded), fixed-effects line (red), subject-specific fits (blue).
ggplot(cd4, aes(x = month, y = cd4, group = ID)) + 
  geom_point() + geom_line(alpha = .2) + 
  geom_line(data = fitted, aes(group = NULL, y = fitted), 
            color = "red", size = 1.1) + 
  geom_line(aes(y = ranslope.mod), color = "blue", alpha = .5)

summary(ranslope.mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cd4 ~ (1 + month | ID) + month
##    Data: cd4
## 
## REML criterion at convergence: 33601
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.4726 -0.5122 -0.0678  0.4270  7.4983 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  ID       (Intercept) 70529    265.57        
##           month        5079     71.27   -0.40
##  Residual             54224    232.86        
## Number of obs: 2371, groups:  ID, 364
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  840.524     15.224   55.21
## month       -106.416      5.207  -20.44
## 
## Correlation of Fixed Effects:
##       (Intr)
## month -0.339
## AIC improves again relative to the random intercept model (33755.67).
AIC(ranslope.mod)
## [1] 33612.96
## random intercept, slope + bs
## Richest model: spline fixed-effect mean in month plus subject-specific
## intercepts and (linear) slopes.
ranbs.mod = lmer(cd4 ~ (1 + month | ID) + bs(month, 5), data = cd4)
cd4$ranbs.mod = fitted(ranbs.mod)
## Population-level fitted curve: X %*% beta-hat (fixed effects only).
fitted = data.frame(month = cd4$month, 
                    fitted = getME(ranbs.mod,'X') %*% fixef(ranbs.mod))

## Data (faded), fixed-effects spline (red), subject-specific fits (blue).
ggplot(cd4, aes(x = month, y = cd4, group = ID)) + 
  geom_point() + geom_line(alpha = .2) + 
  geom_line(data = fitted, aes(group = NULL, y = fitted), 
            color = "red", size = 1.1) + 
  geom_line(aes(y = ranbs.mod), color = "blue", alpha = .5)

summary(ranbs.mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cd4 ~ (1 + month | ID) + bs(month, 5)
##    Data: cd4
## 
## REML criterion at convergence: 33408.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.6873 -0.5043 -0.0494  0.4443  7.5187 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  ID       (Intercept) 71333    267.08        
##           month        4851     69.65   -0.43
##  Residual             50484    224.69        
## Number of obs: 2371, groups:  ID, 364
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)     910.11      50.09  18.169
## bs(month, 5)1   157.54      73.43   2.145
## bs(month, 5)2   164.87      47.42   3.477
## bs(month, 5)3  -588.42      64.43  -9.133
## bs(month, 5)4  -264.07      65.57  -4.027
## bs(month, 5)5  -609.54      82.14  -7.421
## 
## Correlation of Fixed Effects:
##             (Intr) b(,5)1 b(,5)2 b(,5)3 b(,5)4
## bs(mnth,5)1 -0.788                            
## bs(mnth,5)2 -0.791  0.485                     
## bs(mnth,5)3 -0.835  0.831  0.471              
## bs(mnth,5)4 -0.652  0.404  0.738  0.332       
## bs(mnth,5)5 -0.632  0.574  0.401  0.703  0.223
## FIX: a second, byte-identical summary(ranbs.mod) call (and its duplicated
## output) was removed here -- it printed the same table twice.
AIC(ranbs.mod)
## [1] 33428.49
####################################################################

0.0.16 Multilevel models

####################################################################
## two simulated data examples
####################################################################

I = 100   ## number of subjects
J = 5     ## observations per subject

## Subject-indicator design matrix: a block of J ones per subject, so row
## (i-1)*J + j maps to subject i.  FIX: kronecker() builds this exactly; the
## previous rep()/recycling construction produced the same matrix but emitted
## a "data length is not a sub-multiple" warning.
Z.des = kronecker(diag(I), rep(1, J))

bi = rnorm(I, 0, 25)     ## large between-subject variance
eps = rnorm(I*J, 0, 1)   ## unit within-subject noise
yij = Z.des %*% bi + eps

## Plot the simulated observations; strong clustering by subject is visible.
## (Axis label fixed from the plotmath typo y_[ij] to y[ij].)
plot(yij, xlab = "ij", ylab = expression(y[ij]), pch = 18)

## Same residual noise, but with much smaller between-subject variance.
bi = rnorm(I, 0, 1)
yij = Z.des %*% bi + eps
plot(yij, xlab = "ij", ylab = expression(y[ij]), pch = 18)

## Subject index for each observation, recovered from the design matrix.
subjs = Z.des %*% (1:I)

## Random intercept model recovers the two variance components.
## (No seed is set above, so the numbers below reflect one realization.)
lmer(yij ~ (1 | subjs))
## Linear mixed model fit by REML ['lmerMod']
## Formula: yij ~ (1 | subjs)
## REML criterion at convergence: 1668.834
## Random effects:
##  Groups   Name        Std.Dev.
##  subjs    (Intercept) 0.8999  
##  Residual             1.1077  
## Number of obs: 500, groups:  subjs, 100
## Fixed Effects:
## (Intercept)  
##      -0.191
####################################################################
## three level data
####################################################################

set.seed(121)

I = 20    ## level-3 units
J = 10    ## level-2 units per level-3 unit
K = 10    ## observations per level-2 unit

## Indicator design matrices.  FIX: kronecker() builds each block-indicator
## matrix exactly; the previous rep()/recycling constructions produced the
## same matrices but emitted "data length is not a sub-multiple" warnings.
ZI.des = kronecker(diag(I), rep(1, J*K))    ## maps each obs to its level-3 unit
ZIJ.des = kronecker(diag(I*J), rep(1, K))   ## maps each obs to its level-2 unit

bi = rnorm(I, 0, 50)       ## level-3 random effects
bij = rnorm(I*J, 0, 25)    ## level-2 random effects
## FIX: eps previously had length I*J (200) and was silently recycled across
## the I*J*K (2000) observations, so the "independent" residuals repeated
## every 200 rows.  Draw one residual per observation.
eps = rnorm(I*J*K, 0, 1)
yij = ZI.des %*% bi + ZIJ.des %*% bij + eps

dev.new(width = 7, height = 3.5)
par(mai = c(.7,.7,.2,.2))
plot(yij, xlab = "ij", ylab = expression(y[ij]), pch = 18)
abline(v = c(J*K * (0:I)), lty = 2)   ## dashed lines separate level-3 units

L1 = ZI.des %*% (1:I)        ## level-3 id per observation
L2 = ZIJ.des %*% (1:(I*J))   ## level-2 id per observation

## Nested random intercepts recover the three variance components.
nested.mod = lmer(yij ~ (1 | L1) + (1 | L2))
summary(nested.mod)
## NOTE(review): the output below was generated with the original recycled
## eps, so exact values will differ slightly after the fix.
## Linear mixed model fit by REML ['lmerMod']
## Formula: yij ~ (1 | L1) + (1 | L2)
## 
## REML criterion at convergence: 7464.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.84231 -0.67622 -0.02368  0.67047  2.54767 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  L2       (Intercept)  527.003 22.957  
##  L1       (Intercept) 2137.465 46.233  
##  Residual                1.004  1.002  
## Number of obs: 2000, groups:  L2, 200; L1, 20
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   0.7712    10.4646   0.074
####################################################################
## effect of bayes
####################################################################

set.seed(2008)

## Grid of mu values over which to evaluate prior, likelihood, and posterior.
mu.grid <- seq(-5, 25, length = 1000)

n <- 10

## Sampling model: y_i ~ N(l.mean, l.var).  The likelihood for mu is
## summarized through the sampling distribution of the sample mean.
l.var <- 25
l.mean <- 20
y <- rnorm(n, mean = l.mean, sd = sqrt(l.var))
like <- dnorm(mu.grid, mean = mean(y), sd = sqrt(l.var/n))

## Prior: mu ~ N(p.mean, p.var), deliberately centered far from l.mean.
p.var <- 50
p.mean <- 2
prior <- dnorm(mu.grid, mean = p.mean, sd = sqrt(p.var))

## Conjugate normal-normal posterior: a precision-weighted average of the
## sample mean and the prior mean.
post.mean <- mean(y) * p.var / (p.var + l.var/n) +
  p.mean * (l.var/n) / (p.var + l.var/n)
post.var <- (p.var * l.var / n) / (p.var + l.var/n)
post <- dnorm(mu.grid, mean = post.mean, sd = sqrt(post.var))

## Posterior (black), prior (red, col 2), likelihood (blue, col 4).
plot(mu.grid, post, type = 'l', lwd = 3,
     xlab = expression(mu), ylab = expression(p(mu)))
lines(mu.grid, prior, col = 2, lwd = 3)
lines(mu.grid, like, col = 4, lwd = 3)

####################################################################

0.0.17 Measurement error; mediation; confounding

####################################################################
## simulate a simple linear regression with measurement error
####################################################################

## generate data
set.seed(14)

x = rnorm(30, 3, 3)
w = x + rnorm(30,0,3)
y = 2+.6*x +rnorm(30, 0, 1)

## fit regressions
linmod.x = lm(y~x)
linmod.w = lm(y~w)

## plot data
par(mai = c(.1, .1, .1, .1))
plot(x,y, axes = FALSE, col = "black", pch = 19, 
     xlab = "", ylab = "", xlim = range(c(x, w)))
points(w,y, col = "red", pch = 19)
abline(h = 0, lwd = 2); abline(v = 0, lwd = 2)
abline(linmod.x, col = "blue", lwd = 2)
abline(linmod.w, col = "blue", lwd = 2, lty = 2)

####################################################################
## simulate a regression calibration model
####################################################################

## generate data
set.seed(14)

x = rnorm(30, 3, 3)
w = x + rnorm(30,0,3)
y = 2+.6*x +rnorm(30, 0, 1)
z = x + rnorm(30,0,2)
fitted = fitted(lm(w~z))

## fit regressions
linmod.x = lm(y~x)
linmod.w = lm(y~w)
linmod.fitted = lm(y~fitted)

## plot data
par(mai = c(.1, .1, .1, .1))
plot(x,y, axes = FALSE, col = "black", pch = 19, 
     xlab = "", ylab = "", xlim = range(c(x, w)))
points(w,y, col = "red", pch = 19)
abline(h = 0, lwd = 2); abline(v = 0, lwd = 2)
abline(linmod.x, col = "blue", lwd = 2)
abline(linmod.w, col = "blue", lwd = 2, lty = 2)
abline(linmod.fitted, col = "blue", lwd = 2, lty = 2)

####################################################################
## perform SIMEX
####################################################################

## generate data
set.seed(14)

sig2u = 9

x = rnorm(30, 3, 3)
w = x + rnorm(30, 0, sqrt(sig2u))
y = 2+.6*x +rnorm(30, 0, 1)

lam = seq(.5, 5, by = .5)
beta.lam = matrix(NA, nrow = 1000, ncol = length(lam))

for(l in 1:length(lam)){
  for(i in 1:1000){
    wb = w + rnorm(30, 0, sqrt(lam[l] * sig2u))
    beta.lam[i,l] = coef(lm(y ~ wb))[2]
  }
}

lam = c(0, lam)
mean.beta = c(coef(lm(y ~ w))[2], apply(beta.lam,2,mean))

lam2 = lam^2
lam3 = lam^3

simex.coef = coef(lm(mean.beta ~ lam + lam2 + lam3))

new.lam = seq(-1, 5, by = .1)
simex.fit = cbind(1, new.lam, new.lam^2, new.lam^3) %*% simex.coef

quartz(width = 6, height = 4)
par(mai = c(.8, .8, .1, .1))

plot(lam, mean.beta, 
     xlim = range(new.lam), 
     ylim = range(0, simex.fit), pch = 19, 
     col = c(2, rep(1, length(lam)-1 )),
     xlab = expression(lambda), 
     ylab = expression(beta[1]))
points(new.lam, simex.fit, type = 'l', lty = 2)

####################################################################

0.0.18 Logistic regression

####################################################################
## simulate some binary data
####################################################################

set.seed(11201)

## True coefficients on the logit scale.
beta0 <- 1
beta1 <- .75

x <- rnorm(100, 0, 3)
## Success probabilities via the inverse logit (boot::inv.logit).
## (Renamed from `pi`, which masked the base constant.)
probs <- inv.logit(beta0 + beta1 * x)
y <- rbinom(100, 1, probs)

data <- data.frame(x = x, y = y)

## A straight-line fit to 0/1 outcomes escapes the [0, 1] range --
## the motivation for logistic regression.
ggplot(data, aes(x = x, y = y)) + geom_point() +
  theme_bw() + ylim(-.25, 1.25) +
  stat_smooth(method = "lm", se = FALSE)
## Warning: Removed 10 rows containing missing values (geom_smooth).

####################################################################
## logistic regression
####################################################################

## Logistic regression of the simulated binary outcome on x.
model = glm(y~x, family = binomial(link = "logit"), data = data)
summary(model)
## 
## Call:
## glm(formula = y ~ x, family = binomial(link = "logit"), data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9360  -0.4631   0.1561   0.5564   1.8131  
## 
## Coefficients:
##             Estimate Std. Error z value   Pr(>|z|)    
## (Intercept)   1.1072     0.3357   3.298   0.000974 ***
## x             0.8097     0.1664   4.865 0.00000115 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 129.49  on 99  degrees of freedom
## Residual deviance:  73.24  on 98  degrees of freedom
## AIC: 77.24
## 
## Number of Fisher Scoring iterations: 6
## Add fitted probabilities and their logit-scale values for plotting.
data = mutate(data, fitted = fitted(model), 
              fitted_logit = logit(fitted))

## Left: data with the fitted probability curve (blue).
## Right: the same fit on the logit scale, where it is a straight line.
p1 = ggplot(data, aes(x = x, y = y)) + geom_point() + 
  theme_bw() + ylim(-.25, 1.25) + 
  geom_line(aes(x = x, y = fitted), color = "blue")
p2 = ggplot(data, aes(x = x, y = fitted_logit)) + geom_line(color = "blue") + 
  theme_bw()

grid.arrange(p1, p2, nrow = 1, ncol = 2)

####################################################################